From 81f88f6ab674973d361b6d176aa4d3ebd32253ab Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 3 Dec 2025 19:15:07 +0000 Subject: libbpf: Add debug messaging in dedup equivalence/identity matching We have seen a number of issues like [1]; failures to deduplicate key kernel data structures like task_struct. These are often hard to debug from pahole even with verbose output, especially when identity/equivalence checks fail deep in a nested struct comparison. Here we add debug messages of the form libbpf: STRUCT 'task_struct' size=2560 vlen=194 cand_id[54222] canon_id[102820] shallow-equal but not equiv for field#23 'sched_class': 0 These will be emitted during dedup from pahole when --verbose/-V is specified. This greatly helps identify exactly where dedup failures are experienced. [1] https://lore.kernel.org/bpf/b8e8b560-bce5-414b-846d-0da6d22a9983@oracle.com/ Changes since v1: - updated debug messages to refer to shallow-equal, added ids (Andrii) Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203191507.55565-1-alan.maguire@oracle.com --- tools/lib/bpf/btf.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 84a4b0abc8be..b136572e889a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4431,11 +4431,14 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2, struct btf_type *t1, *t2; int k1, k2; recur: - if (depth <= 0) - return false; - t1 = btf_type_by_id(d->btf, id1); t2 = btf_type_by_id(d->btf, id2); + if (depth <= 0) { + pr_debug("Reached depth limit for identical type comparison for '%s'/'%s'\n", + btf__name_by_offset(d->btf, t1->name_off), + btf__name_by_offset(d->btf, t2->name_off)); + return false; + } k1 = btf_kind(t1); k2 = btf_kind(t2); @@ -4497,8 +4500,16 @@ recur: for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { if (m1->type == m2->type) continue; - if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) + if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) { + if (t1->name_off) { + pr_debug("%s '%s' size=%d vlen=%d id1[%u] id2[%u] shallow-equal but not identical for field#%d '%s'\n", + k1 == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, t1->name_off), + t1->size, btf_vlen(t1), id1, id2, i, + btf__name_by_offset(d->btf, m1->name_off)); + } return false; + } } return true; } @@ -4739,8 +4750,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_m = btf_members(canon_type); for (i = 0; i < vlen; i++) { eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); - if (eq <= 0) + if (eq <= 0) { + if (cand_type->name_off) { + pr_debug("%s '%s' size=%d vlen=%d cand_id[%u] canon_id[%u] shallow-equal but not equiv for field#%d '%s': %d\n", + cand_kind == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, cand_type->name_off), + cand_type->size, vlen, cand_id, canon_id, i, + btf__name_by_offset(d->btf, cand_m->name_off), eq); + } return eq; + } cand_m++; canon_m++; } -- cgit v1.2.3 From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 17 ++++++++ kernel/bpf/bpf_struct_ops.c | 88 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 3 ++ kernel/bpf/syscall.c | 46 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 17 ++++++++ 6 files changed, 187 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 278490683d28..c43346cb3d76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_prog_disassoc_struct_ops(st_map->links[i]->prog); + } +} + static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; @@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); @@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_dissoc_progs(st_map); + bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. @@ -1396,6 +1412,78 @@ err_out: return err; } +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (st_ops_assoc && st_ops_assoc == map) + return 0; + + if (st_ops_assoc) { + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EBUSY; + + rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON); + } else { + /* + * struct_ops map does not track associated non-struct_ops programs. + * Bump the refcount to make sure st_ops_assoc is always valid. + */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_inc(map); + + rcu_assign_pointer(prog->aux->st_ops_assoc, map); + } + + return 0; +} + +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_put(st_ops_assoc); + + RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL); +} + +/* + * Get a reference to the struct_ops struct (i.e., kdata) associated with a + * program. Should only be called in BPF program context (e.g., in a kfunc). + * + * If the returned pointer is not NULL, it must points to a valid struct_ops. + * The struct_ops map is not guaranteed to be initialized nor attached. + * Kernel struct_ops implementers are responsible for tracking and checking + * the state of the struct_ops if the use case requires an initialized or + * attached struct_ops. + */ +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + struct bpf_struct_ops_map *st_map; + struct bpf_map *st_ops_assoc; + + st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held()); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return NULL; + + st_map = (struct bpf_struct_ops_map *)st_ops_assoc; + + return &st_map->kvalue.data; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops); + void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8ae6ab31651..67226145a4db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); + mutex_init(&fp->aux->st_ops_assoc_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); @@ -286,6 +287,7 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); + mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } @@ -2896,6 +2898,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); + bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6589acc89ef8..3080cc48bfc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6122,6 +6122,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6304,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index be7d8e060e10..6b92b0847ec2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 87cd177b149a5d86103736994307c25999e0be4b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:45 -0800 Subject: libbpf: Add support for associating BPF program with struct_ops Add low-level wrapper and libbpf API for BPF_PROG_ASSOC_STRUCT_OPS command in the bpf() syscall. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-4-ameryhung@gmail.com --- tools/lib/bpf/bpf.c | 19 +++++++++++++++++++ tools/lib/bpf/bpf.h | 21 +++++++++++++++++++++ tools/lib/bpf/libbpf.c | 31 +++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 16 ++++++++++++++++ tools/lib/bpf/libbpf.map | 2 ++ 5 files changed, 89 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index b66f5fbfbbb2..21b57a629916 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -1397,3 +1397,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz); return libbpf_err_errno(err); } + +int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops); + union bpf_attr attr; + int err; + + if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_assoc_struct_ops.map_fd = map_fd; + attr.prog_assoc_struct_ops.prog_fd = prog_fd; + attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0); + + err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz); + return libbpf_err_errno(err); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index e983a3e40d61..1f9c28d27795 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -733,6 +733,27 @@ struct bpf_prog_stream_read_opts { LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, struct bpf_prog_stream_read_opts *opts); +struct bpf_prog_assoc_struct_ops_opts { + size_t sz; + __u32 flags; + size_t :0; +}; +#define bpf_prog_assoc_struct_ops_opts__last_field flags + +/** + * @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a + * struct_ops map. + * + * @param prog_fd FD for the BPF program + * @param map_fd FD for the struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0 on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3dc8a8078815..c7c79014d46c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -14133,6 +14133,37 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return 0; } +int bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + int prog_fd, map_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't associate BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("prog '%s': can't associate struct_ops program\n", prog->name); + return libbpf_err(-EINVAL); + } + + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + pr_warn("map '%s': can't associate BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + + if (!bpf_map__is_struct_ops(map)) { + pr_warn("map '%s': can't associate non-struct_ops map\n", map->name); + return libbpf_err(-EINVAL); + } + + return bpf_prog_assoc_struct_ops(prog_fd, map_fd, opts); +} + int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { int err = 0, n, len, start, end = -1; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 65e68e964b89..e14d9e349f9c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1006,6 +1006,22 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); +struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */ + +/** + * @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a + * struct_ops map. + * + * @param prog BPF program + * @param map struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int +bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts); + /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 8ed8749907d4..84fb90a016c9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -451,4 +451,6 @@ LIBBPF_1.7.0 { global: bpf_map__set_exclusive_program; bpf_map__exclusive_program; + bpf_prog_assoc_struct_ops; + bpf_program__assoc_struct_ops; } LIBBPF_1.6.0; -- cgit v1.2.3 From 33a165f9c2c1b9ddceaaccc356ce841baf1a08a2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:46 -0800 Subject: selftests/bpf: Test BPF_PROG_ASSOC_STRUCT_OPS command Test BPF_PROG_ASSOC_STRUCT_OPS command that associates a BPF program with a struct_ops. The test follows the same logic in commit ba7000f1c360 ("selftests/bpf: Test multi_st_ops and calling kfuncs from different programs"), but instead of using map id to identify a specific struct_ops, this test uses the new BPF command to associate a struct_ops with a program. The test consists of two sets of almost identical struct_ops maps and BPF programs associated with the map. Their only difference is the unique value returned by bpf_testmod_multi_st_ops::test_1(). The test first loads the programs and associates them with struct_ops maps. Then, it exercises the BPF programs. They will in turn call kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() to trigger test_1() of the associated struct_ops map, and then check if the right unique value is returned. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-5-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 72 ++++++++++++++ .../testing/selftests/bpf/progs/struct_ops_assoc.c | 105 +++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 17 ++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 1 + 4 files changed, 195 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c new file mode 100644 index 000000000000..1e24a4915524 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "struct_ops_assoc.skel.h" + +static void test_st_ops_assoc(void) +{ + struct struct_ops_assoc *skel = NULL; + int err, pid; + + skel = struct_ops_assoc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc__open")) + goto out; + + /* cannot explicitly associate struct_ops program */ + err = bpf_program__assoc_struct_ops(skel->progs.test_1_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(test_1_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_b, st_ops_map_b)"); + + /* sys_enter_prog_a already associated with map_a */ + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_b, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_b)"); + + err = struct_ops_assoc__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel->bss->test_pid = pid; + sys_gettid(); + skel->bss->test_pid = 0; + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc__destroy(skel); +} + +void test_struct_ops_assoc(void) +{ + if (test__start_subtest("st_ops_assoc")) + test_st_ops_assoc(); +} diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c new file mode 100644 index 000000000000..8f1097903e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +int test_pid; + +/* Programs associated with st_ops_map_a */ + +#define MAP_A_MAGIC 1234 +int test_err_a; + +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + return MAP_A_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +#define MAP_B_MAGIC 5678 +int test_err_b; + +SEC("struct_ops") +int BPF_PROG(test_1_b, struct st_ops_args *args) +{ + return MAP_B_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_b, +}; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1669a7eeda26..90c4b1a51de6 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1134,6 +1134,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) @@ -1176,6 +1177,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1637,6 +1639,7 @@ static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) return NULL; } +/* Call test_1() of the struct_ops map identified by the id */ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) { struct bpf_testmod_multi_st_ops *st_ops; @@ -1652,6 +1655,20 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) return ret; } +/* Call test_1() of the associated struct_ops map */ +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +{ + struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; + struct bpf_testmod_multi_st_ops *st_ops; + int ret = -1; + + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + if (st_ops) + ret = st_ops->test_1(args); + + return ret; +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 4df6fa6a92cb..2357a0340ffe 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -162,5 +162,6 @@ struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 04fd12df4e05dd6fd3017b637f6fbc9da10b4e65 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:47 -0800 Subject: selftests/bpf: Test ambiguous associated struct_ops Add a test to make sure implicit struct_ops association does not break backward compatibility nor return incorrect struct_ops. struct_ops programs should still be allowed to be reused in different struct_ops map. The associated struct_ops map set implicitly however will be poisoned. Trying to read it through the helper bpf_prog_get_assoc_struct_ops() should result in a NULL pointer. While recursion of test_1() cannot happen due to the associated struct_ops being ambiguois, explicitly check for it to prevent stack overflow if the test regresses. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-6-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 38 +++++++++++ .../selftests/bpf/progs/struct_ops_assoc_reuse.c | 75 ++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 1e24a4915524..02173504f675 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -2,6 +2,7 @@ #include #include "struct_ops_assoc.skel.h" +#include "struct_ops_assoc_reuse.skel.h" static void test_st_ops_assoc(void) { @@ -65,8 +66,45 @@ out: struct_ops_assoc__destroy(skel); } +static void test_st_ops_assoc_reuse(void) +{ + struct struct_ops_assoc_reuse *skel = NULL; + int err; + + skel = struct_ops_assoc_reuse__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_reuse__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = struct_ops_assoc_reuse__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc_reuse__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); + if (test__start_subtest("st_ops_assoc_reuse")) + test_st_ops_assoc_reuse(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c new file mode 100644 index 000000000000..5bb6ebf5eed4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define MAP_A_MAGIC 1234 +int test_err_a; +int recur; + +/* + * test_1_a is reused. The kfunc should not be able to get the associated + * struct_ops and call test_1 recursively as it is ambiguous. + */ +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + int ret; + + if (!recur) { + recur++; + ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + if (ret != -1) + test_err_a++; + recur--; + } + + return MAP_A_MAGIC; +} + +/* Programs associated with st_ops_map_a */ + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +int test_err_b; + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_a, +}; -- cgit v1.2.3 From 0e841d19263ab6e1ca2b280109832f57624e48d1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:48 -0800 Subject: selftests/bpf: Test getting associated struct_ops in timer callback Make sure 1) a timer callback can also reference the associated struct_ops, and then make sure 2) the timer callback cannot get a dangled pointer to the struct_ops when the map is freed. The test schedules a timer callback from a struct_ops program since struct_ops programs do not pin the map. It is possible for the timer callback to run after the map is freed. The timer callback calls a kfunc that runs .test_1() of the associated struct_ops, which should return MAP_MAGIC when the map is still alive or -1 when the map is gone. The first subtest added in this patch schedules the timer callback to run immediately, while the map is still alive. The second subtest added schedules the callback to run 500ms after syscall_prog runs and then frees the map right after syscall_prog runs. Both subtests then wait until the callback runs to check the return of the kfunc. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-7-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 81 ++++++++++++++++++++++ .../bpf/progs/struct_ops_assoc_in_timer.c | 77 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 02173504f675..461ded722351 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -3,6 +3,7 @@ #include #include "struct_ops_assoc.skel.h" #include "struct_ops_assoc_reuse.skel.h" +#include "struct_ops_assoc_in_timer.skel.h" static void test_st_ops_assoc(void) { @@ -101,10 +102,90 @@ out: struct_ops_assoc_reuse__destroy(skel); } +static void test_st_ops_assoc_in_timer(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + err = struct_ops_assoc_in_timer__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again + * immediately. + */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, 1234, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + +static void test_st_ops_assoc_in_timer_no_uref(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + struct bpf_link *link; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + link = bpf_map__attach_struct_ops(skel->maps.st_ops_map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again. + * timer_cb will run 500ms after syscall_prog runs, when the user space no longer + * holds a reference to st_ops_map. + */ + skel->bss->timer_ns = 500000000; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Detach and close struct_ops map to cause it to be freed */ + bpf_link__destroy(link); + close(bpf_program__fd(skel->progs.syscall_prog)); + close(bpf_map__fd(skel->maps.st_ops_map)); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, -1, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); if (test__start_subtest("st_ops_assoc_reuse")) test_st_ops_assoc_reuse(); + if (test__start_subtest("st_ops_assoc_in_timer")) + test_st_ops_assoc_in_timer(); + if (test__start_subtest("st_ops_assoc_in_timer_no_uref")) + test_st_ops_assoc_in_timer_no_uref(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c new file mode 100644 index 000000000000..d5a2ea934284 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array_map SEC(".maps"); + +#define MAP_MAGIC 1234 +int recur; +int test_err; +int timer_ns; +int timer_test_1_ret; +int timer_cb_run; + +__noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + struct st_ops_args args = {}; + + recur++; + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + recur--; + + timer_cb_run++; + + return 0; +} + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + struct bpf_timer *timer; + int key = 0; + + if (!recur) { + timer = bpf_map_lookup_elem(&array_map, &key); + if (!timer) + return 0; + + bpf_timer_init(timer, &array_map, 1); + bpf_timer_set_callback(timer, timer_cb); + bpf_timer_start(timer, timer_ns, 0); + } + + return MAP_MAGIC; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; -- cgit v1.2.3 From 311ead1be05d2348e89f873337c8375e856e1abb Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:29 +0800 Subject: selftests: cgroup: Add cg_read_key_long_poll() to poll a cgroup key with retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new helper function `cg_read_key_long_poll()` in cgroup_util.h. This function polls the specified key in a cgroup file until it matches the expected value or the retry limit is reached, with configurable wait intervals between retries. This helper is particularly useful for handling asynchronously updated cgroup statistics (e.g., memory.stat), where immediate reads may observe stale values, especially on busy systems. It allows tests and other utilities to handle such cases more flexibly. Signed-off-by: Guopeng Zhang Suggested-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 21 +++++++++++++++++++++ .../selftests/cgroup/lib/include/cgroup_util.h | 5 +++++ 2 files changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 44c52f620fda..ce6c2642fd9b 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -168,6 +168,27 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us) +{ + long val = -1; + int i; + + for (i = 0; i < retries; i++) { + val = cg_read_key_long(cgroup, control, key); + if (val < 0) + return val; + + if (val == expected) + break; + + usleep(wait_interval_us); + } + + return val; +} + long cg_read_lc(const char *cgroup, const char *control) { char buf[PAGE_SIZE]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 7ab2824ed7b5..77f386dab5e8 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -17,6 +17,8 @@ #define CG_NAMED_NAME "selftest" #define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) +#define DEFAULT_WAIT_INTERVAL_US (100 * 1000) /* 100 ms */ + /* * Checks if two given values differ by less than err% of their sum. */ @@ -64,6 +66,9 @@ extern int cg_read_strstr(const char *cgroup, const char *control, extern long cg_read_long(const char *cgroup, const char *control); extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_open(const char *cgroup, const char *control, int flags); -- cgit v1.2.3 From 6360d444ae32871c6a048ac880ef3b871a439bad Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:30 +0800 Subject: selftests: cgroup: make test_memcg_sock robust against delayed sock stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_memcg_sock() currently requires that memory.stat's "sock " counter is exactly zero immediately after the TCP server exits. On a busy system this assumption is too strict: - Socket memory may be freed with a small delay (e.g. RCU callbacks). - memcg statistics are updated asynchronously via the rstat flushing worker, so the "sock " value in memory.stat can stay non-zero for a short period of time even after all socket memory has been uncharged. As a result, test_memcg_sock() can intermittently fail even though socket memory accounting is working correctly. Make the test more robust by polling memory.stat for the "sock " counter and allowing it some time to drop to zero instead of checking it only once. The timeout is set to 3 seconds to cover the periodic rstat flush interval (FLUSH_TIME = 2*HZ by default) plus some scheduling slack. If the counter does not become zero within the timeout, the test still fails as before. On my test system, running test_memcontrol 50 times produced: - Before this patch: 6/50 runs passed. - After this patch: 50/50 runs passed. Signed-off-by: Guopeng Zhang Suggested-by: Lance Yang Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_memcontrol.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 4e1647568c5b..2fb096a2a9f9 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -21,6 +21,8 @@ #include "kselftest.h" #include "cgroup_util.h" +#define MEMCG_SOCKSTAT_WAIT_RETRIES 30 + static bool has_localevents; static bool has_recursiveprot; @@ -1384,6 +1386,7 @@ static int test_memcg_sock(const char *root) int bind_retries = 5, ret = KSFT_FAIL, pid, err; unsigned short port; char *memcg; + long sock_post = -1; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -1432,7 +1435,22 @@ static int test_memcg_sock(const char *root) if (cg_read_long(memcg, "memory.current") < 0) goto cleanup; - if (cg_read_key_long(memcg, "memory.stat", "sock ")) + /* + * memory.stat is updated asynchronously via the memcg rstat + * flushing worker, which runs periodically (every 2 seconds, + * see FLUSH_TIME). On a busy system, the "sock " counter may + * stay non-zero for a short period of time after the TCP + * connection is closed and all socket memory has been + * uncharged. + * + * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some + * scheduling slack) and require that the "sock " counter + * eventually drops to zero. + */ + sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0, + MEMCG_SOCKSTAT_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (sock_post) goto cleanup; ret = KSFT_PASS; -- cgit v1.2.3 From 50133c09d189a26f4cc6e78e382864fd599a1dc4 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:31 +0800 Subject: selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the manual sleep-and-retry logic in test_kmem_dead_cgroups() with the new helper `cg_read_key_long_poll()`. This change improves the robustness of the test by polling the "nr_dying_descendants" counter in `cgroup.stat` until it reaches 0 or the timeout is exceeded. Additionally, increase the retry timeout to 8 seconds (from 5 seconds) based on testing results: - With 5-second timeout: 4/20 runs passed. - With 8-second timeout: 20/20 runs passed. The 8 second timeout is based on stress testing of test_kmem_dead_cgroups() under load: 5 seconds was occasionally not enough for reclaim of dying descendants to complete, whereas 8 seconds consistently covered the observed latencies. This value is intended as a generous upper bound for the asynchronous reclaim and is not tied to any specific kernel constant, so it can be adjusted in the future if reclaim behavior changes. Signed-off-by: Guopeng Zhang Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 33 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index ca38525484e3..eeabd34bf083 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -26,6 +26,7 @@ */ #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) +#define KMEM_DEAD_WAIT_RETRIES 80 static int alloc_dcache(const char *cgroup, void *arg) { @@ -306,9 +307,7 @@ static int test_kmem_dead_cgroups(const char *root) { int ret = KSFT_FAIL; char *parent; - long dead; - int i; - int max_time = 20; + long dead = -1; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -323,21 +322,19 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < max_time; i++) { - dead = cg_read_key_long(parent, "cgroup.stat", - "nr_dying_descendants "); - if (dead == 0) { - ret = KSFT_PASS; - break; - } - /* - * Reclaiming cgroups might take some time, - * let's wait a bit and repeat. - */ - sleep(1); - if (i > 5) - printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); - } + /* + * Allow up to ~8s for reclaim of dying descendants to complete. + * This is a generous upper bound derived from stress testing, not + * from a specific kernel constant, and can be adjusted if reclaim + * behavior changes in the future. + */ + dead = cg_read_key_long_poll(parent, "cgroup.stat", + "nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (dead) + goto cleanup; + + ret = KSFT_PASS; cleanup: cg_destroy(parent); -- cgit v1.2.3 From 18352f8fae91d23bbd7165a7b2a1f15c4f5beff8 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 8 Dec 2025 22:14:32 +0900 Subject: selftests/bpf: add tests for attaching invalid fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test cases for situations where adding the following types of file descriptors to a cpumap entry should fail: - Non-BPF file descriptor (expect -EINVAL) - Nonexistent file descriptor (expect -EBADF) Also tighten the assertion for the expected error when adding a non-BPF_XDP_CPUMAP program to a cpumap entry. Signed-off-by: Kohei Enju Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20251208131449.73036-3-enjuk@amazon.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_cpumap_attach.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index df27535995af..ad56e4370ce3 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -18,7 +18,7 @@ static void test_xdp_with_cpumap_helpers(void) struct bpf_cpumap_val val = { .qsize = 192, }; - int err, prog_fd, prog_redir_fd, map_fd; + int err, prog_fd, prog_redir_fd, map_fd, bad_fd; struct nstoken *nstoken = NULL; __u32 idx = 0; @@ -79,7 +79,22 @@ static void test_xdp_with_cpumap_helpers(void) val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + ASSERT_EQ(err, -EINVAL, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + + /* Try to attach non-BPF file descriptor */ + bad_fd = open("/dev/null", O_RDONLY); + ASSERT_GE(bad_fd, 0, "Open /dev/null for non-BPF fd"); + + val.bpf_prog.fd = bad_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EINVAL, "Add non-BPF fd to cpumap entry"); + + /* Try to attach nonexistent file descriptor */ + err = close(bad_fd); + ASSERT_EQ(err, 0, "Close non-BPF fd for nonexistent fd"); + + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EBADF, "Add nonexistent fd to cpumap entry"); /* Try to attach BPF_XDP program with frags to cpumap when we have * already loaded a BPF_XDP program on the map -- cgit v1.2.3 From a5b4867fad18e72fd5fc442c16be83723776283b Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 2 Dec 2025 18:02:20 +0000 Subject: selftests/bpf: add verifier sign extension bound computation tests. This commit adds 3 tests to verify a common compiler generated pattern for sign extension (r1 <<= 32; r1 s>>= 32). The tests make sure the register bounds are correctly computed both for positive and negative register values. Signed-off-by: Cupertino Miranda Signed-off-by: Andrew Pinski Acked-by: Eduard Zingerman Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20251202180220.11128-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_subreg.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 8613ea160dcd..b3e1c3eef9ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -531,6 +531,74 @@ __naked void arsh32_imm_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__description("arsh32 imm sign positive extend check") +__success __retval(0) +__log_level(2) +__msg("2: (57) r6 &= 4095 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__msg("3: (67) r6 <<= 32 ; R6=scalar(smin=smin32=0,smax=umax=0xfff00000000,smax32=umax32=0,var_off=(0x0; 0xfff00000000))") +__msg("4: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__naked void arsh32_imm_sign_extend_positive_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign negative extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__naked void arsh32_imm_sign_extend_negative_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__naked void arsh32_imm_sign_extend_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 2047; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("end16 (to_le) reg zero extend check") __success __success_unpriv __retval(0) -- cgit v1.2.3 From 0c82fdbbbfbee878a0a5e884ea5f31dd16138f0d Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Sat, 29 Nov 2025 14:41:22 +0530 Subject: selftests: statmount: tests for STATMOUNT_BY_FD Add tests for STATMOUNT_BY_FD flag, which adds support for passing a file descriptors to statmount(). The fd can also be on a "unmounted" mount (mount unmounted with MNT_DETACH), we also include tests for that. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Bhavik Sachdev Link: https://patch.msgid.link/20251129091455.757724-4-b.sachdev1904@gmail.com Signed-off-by: Christian Brauner --- .../selftests/filesystems/statmount/statmount.h | 15 +- .../filesystems/statmount/statmount_test.c | 261 +++++++++++++++++++-- .../filesystems/statmount/statmount_test_ns.c | 101 +++++++- 3 files changed, 354 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index 99e5ad082fb1..e1cba4bfd8d9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -43,19 +43,24 @@ #endif #endif -static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, - struct statmount *buf, size_t bufsize, +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd, + uint64_t mask, struct statmount *buf, size_t bufsize, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, .param = mask, }; - if (mnt_ns_id) { + if (flags & STATMOUNT_BY_FD) { req.size = MNT_ID_REQ_SIZE_VER1; - req.mnt_ns_id = mnt_ns_id; + req.mnt_fd = fd; + } else { + req.mnt_id = mnt_id; + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } } return syscall(__NR_statmount, &req, buf, bufsize, flags); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 6e53430423d2..a04bcaace126 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -33,15 +33,24 @@ static const char *const known_fs[] = { "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) +static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; - struct statmount *buf = NULL, *tmp = alloca(bufsize); + struct statmount *buf = NULL, *tmp = NULL; int tofree = 0; int ret; + if (flags & STATMOUNT_BY_FD && fd < 0) + return NULL; + + tmp = alloca(bufsize); + for (;;) { - ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); + if (flags & STATMOUNT_BY_FD) + ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); + else + ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); + if (ret != -1) break; if (tofree) @@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void) { struct statmount *sm; - sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0); if (!sm) { ksft_test_result_fail("statmount mount point: %s\n", strerror(errno)); @@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void) assert(last_dir); last_dir++; - sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0); if (!sm) { ksft_test_result_fail("statmount mount root: %s\n", strerror(errno)); @@ -438,7 +447,7 @@ static void test_statmount_fs_type(void) const char *fs_type; const char *const *s; - sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); if (!sm) { ksft_test_result_fail("statmount fs type: %s\n", strerror(errno)); @@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void) char *line = NULL; size_t len = 0; - sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, 0); if (!sm) { ksft_test_result_fail("statmount mnt opts: %s\n", @@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) uint32_t start, i; int ret; - sm = statmount_alloc(root_id, mask, 0); + sm = statmount_alloc(root_id, 0, mask, 0); if (!sm) { ksft_test_result_fail("statmount %s: %s\n", name, strerror(errno)); @@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, 0, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto out; } errno = 0; - ret = statmount(root_id, 0, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -658,6 +667,226 @@ static void test_listmount_tree(void) ksft_test_result_pass("listmount tree\n"); } +static void test_statmount_by_fd(void) +{ + struct statmount *sm = NULL; + char tmpdir[] = "/statmount.fd.XXXXXX"; + const char root[] = "/test"; + char subdir[PATH_MAX], tmproot[PATH_MAX]; + int fd; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot"); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, NULL, MS_BIND, 0)) { + ksft_perror("mount"); + goto err_subdir; + } + + if (mkdir(tmproot, 0755)) { + ksft_perror("mkdir"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_tmproot; + } + + if (chroot(tmproot)) { + ksft_perror("chroot"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_chroot; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_chroot; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n"); + goto err_chroot; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto err_chroot; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_chroot; + } + + if (chroot(".")) { + ksft_perror("chroot"); + goto out; + } + + free(sm); + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_fd; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n"); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto out; + } + + if (strcmp(subdir, sm->str + sm->mnt_point) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_point," + "statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir); + goto out; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root); + goto out; + } + + ksft_test_result_pass("statmount by fd\n"); + goto out; +err_chroot: + chroot("."); +out: + free(sm); +err_fd: + close(fd); +err_tmproot: + rmdir(tmproot); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + +static void test_statmount_by_fd_unmounted(void) +{ + const char root[] = "/test.unmounted"; + char tmpdir[] = "/statmount.fd.XXXXXX"; + char subdir[PATH_MAX]; + int fd; + struct statmount *sm = NULL; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, 0, MS_BIND, NULL)) { + ksft_perror("mount"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_subdir; + } + + if (umount2(tmpdir, MNT_DETACH)) { + ksft_perror("umount2"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd unmounted: %s\n", + strerror(errno)); + goto err_sm; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_sm; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n"); + goto err_sm; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n"); + goto err_sm; + } + + if (strcmp(sm->str + sm->mnt_root, root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_sm; + } + + ksft_test_result_pass("statmount by fd on unmounted mount\n"); +err_sm: + free(sm); +err_fd: + close(fd); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) int main(void) @@ -669,14 +898,14 @@ int main(void) ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); setup_namespace(); - ksft_set_plan(15); + ksft_set_plan(17); test_listmount_empty_root(); test_statmount_zero_mask(); test_statmount_mnt_basic(); @@ -693,6 +922,8 @@ int main(void) test_statmount_string(all_mask, str_off(fs_type), "fs type & all"); test_listmount_tree(); + test_statmount_by_fd_unmounted(); + test_statmount_by_fd(); if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index d56d4103182f..063d9de46431 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void) if (!root_id) return NSID_ERROR; - ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret == -1) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); return NSID_ERROR; @@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void) return NSID_PASS; } +static int _test_statmount_mnt_ns_id_by_fd(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + int ret, fd, mounted = 1, status = NSID_ERROR; + char mnt[] = "/statmount.fd.XXXXXX"; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + if (!mkdtemp(mnt)) { + ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (mount(mnt, mnt, NULL, MS_BIND, 0)) { + ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto err; + } + + fd = open(mnt, O_PATH); + if (fd < 0) { + ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno)); + goto err; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + status = NSID_SKIP; + goto out; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + status = NSID_FAIL; + goto out; + } + + mounted = 0; + if (umount2(mnt, MNT_DETACH)) { + ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno)); + goto out; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + + if (sm.mask == STATMOUNT_MNT_NS_ID) { + ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n"); + status = NSID_FAIL; + goto out; + } + + status = NSID_PASS; +out: + close(fd); + if (mounted) + umount2(mnt, MNT_DETACH); +err: + rmdir(mnt); + return status; +} + + static void test_statmount_mnt_ns_id(void) { pid_t pid; @@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void) if (ret != NSID_PASS) exit(ret); ret = _test_statmount_mnt_ns_id(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id_by_fd(); exit(ret); } @@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) for (int i = 0; i < nr_mounts; i++) { struct statmount sm; - ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret < 0) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); @@ -275,7 +370,7 @@ int main(void) int ret; ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); -- cgit v1.2.3 From 6b401a5b2d2acf56ec902f96f6381982457ab339 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 2 Dec 2025 10:10:12 +0530 Subject: cpupower: idle_monitor: fix incorrect value logged after stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpuidle sysfs monitor printed the previous sample’s counter value in cpuidle_stop() instead of the freshly read one. The dprint line used previous_count[cpu][state] while current_count[cpu][state] had just been populated. This caused misleading debug output. Switch the logging to current_count so the post-interval snapshot matches the displayed value. Link: https://lore.kernel.org/r/20251202044012.3844790-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 8b42c2f0a5b0..4225eff9833d 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -70,7 +70,7 @@ static int cpuidle_stop(void) current_count[cpu][state] = cpuidle_state_time(cpu, state); dprint("CPU %d - State: %d - Val: %llu\n", - cpu, state, previous_count[cpu][state]); + cpu, state, current_count[cpu][state]); } } return 0; -- cgit v1.2.3 From 24858a84163c8d04827166b3bcaed80612bb62fc Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Wed, 26 Nov 2025 14:46:13 +0530 Subject: tools/cpupower: Fix inverted APERF capability check The capability check was inverted, causing the function to return error when APERF support is available and proceed when it is not. Negate the condition to return error only when APERF capability is absent. Link: https://lore.kernel.org/r/20251126091613.567480-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 7d3732f5f2f6..5fe01e516817 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -270,7 +270,7 @@ static int get_freq_hardware(unsigned int cpu, unsigned int human) { unsigned long freq; - if (cpupower_cpu_info.caps & CPUPOWER_CAP_APERF) + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) return -EINVAL; freq = cpufreq_get_freq_hardware(cpu); -- cgit v1.2.3 From 1b9aaf36b7b40235e5a529c15848c3d866362207 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Thu, 27 Nov 2025 10:15:36 +0530 Subject: tools/cpupower: Use strcspn() to strip trailing newline Replace manual newline removal with strcspn() which is safer and cleaner. This avoids potential out-of-bounds access on empty strings and handles the case where no newline exists. Link: https://lore.kernel.org/r/20251127044536.715722-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpuidle.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index f2c1139adf71..6a881d93d2e9 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -193,8 +193,7 @@ static char *cpuidle_state_get_one_string(unsigned int cpu, if (result == NULL) return NULL; - if (result[strlen(result) - 1] == '\n') - result[strlen(result) - 1] = '\0'; + result[strcspn(result, "\n")] = '\0'; return result; } @@ -366,8 +365,7 @@ static char *sysfs_cpuidle_get_one_string(enum cpuidle_string which) if (result == NULL) return NULL; - if (result[strlen(result) - 1] == '\n') - result[strlen(result) - 1] = '\0'; + result[strcspn(result, "\n")] = '\0'; return result; } -- cgit v1.2.3 From f9bd3762cf1bd0c2465f2e6121b340883471d1bf Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Mon, 1 Dec 2025 17:47:45 +0530 Subject: tools/power cpupower: Reset errno before strtoull() cpuidle_state_get_one_value() never cleared errno before calling strtoull(), so a prior ERANGE caused every cpuidle counter read to return zero. Reset errno to 0 before the conversion so each sysfs read is evaluated independently. Link: https://lore.kernel.org/r/20251201121745.3776703-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpuidle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index 6a881d93d2e9..2fcb343d8e75 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -150,6 +150,7 @@ unsigned long long cpuidle_state_get_one_value(unsigned int cpu, if (len == 0) return 0; + errno = 0; value = strtoull(linebuf, &endp, 0); if (endp == linebuf || errno == ERANGE) -- cgit v1.2.3 From ff72619e11348ab189e232c59515dd5c33780d7c Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 2 Dec 2025 12:24:03 +0530 Subject: tools/power cpupower: Show C0 in idle-info dump `cpupower idle-info -o` skipped C0 because the loop began at 1: before: states: C1 ... latency[002] residency[00002] C2 ... latency[010] residency[00020] C3 ... latency[133] residency[00600] after: states: C0 ... latency[000] residency[00000] C1 ... latency[002] residency[00002] C2 ... latency[010] residency[00020] C3 ... latency[133] residency[00600] Start iterating at index 0 so the idle report mirrors sysfs and includes C0 stats. Link: https://lore.kernel.org/r/20251202065403.1492807-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpuidle-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index e0d17f0de3fe..81b4763a97d6 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -111,7 +111,7 @@ static void proc_cpuidle_cpu_output(unsigned int cpu) printf(_("max_cstate: C%u\n"), cstates-1); printf(_("maximum allowed latency: %lu usec\n"), max_allowed_cstate); printf(_("states:\t\n")); - for (cstate = 1; cstate < cstates; cstate++) { + for (cstate = 0; cstate < cstates; cstate++) { printf(_(" C%d: " "type[C%d] "), cstate, cstate); printf(_("promotion[--] demotion[--] ")); -- cgit v1.2.3 From 0355911ac021a424b18d2d746536d70b879cdeab Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:21 -0500 Subject: selftests/bpf: Explicitly account for globals in verifier_arena_large The big_alloc1 test in verifier_arena_large assumes that the arena base and the first page allocated by bpf_arena_alloc_pages are identical. This is not the case, because the first page in the arena is populated by global arena data. The test still passes because the code makes the tacit assumption that the first page is on offset PAGE_SIZE instead of 0. Make this distinction explicit in the code, and adjust the page offsets requested during the test to count from the beginning of the arena instead of using the address of the first allocated page. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-2-emil@etsalapatis.com --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index f19e15400b3e..bd430a34c3ab 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -23,18 +23,25 @@ int big_alloc1(void *ctx) { #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile char __arena *page1, *page2, *no_page, *page3; - void __arena *base; + u64 base; - page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + base = (u64)arena_base(&arena); + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); if (!page1) return 1; + + /* Account for global arena data. */ + if ((u64)page1 != base + PAGE_SIZE) + return 15; + *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2, + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; - no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE, + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) return 3; -- cgit v1.2.3 From 12a1fe6e12dbad39f2f0dad1a385625f0298eff4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:22 -0500 Subject: bpf/verifier: Do not limit maximum direct offset into arena map The verifier currently limits direct offsets into a map to 512MiB to avoid overflow during pointer arithmetic. However, this prevents arena maps from using direct addressing instructions to access data at the end of > 512MiB arena maps. This is necessary when moving arena globals to the end of the arena instead of the front. Refactor the verifier code to remove the offset calculation during direct value access calculations. This is possible because the only two map types that implement .map_direct_value_addr() are arrays and arenas, and they both do their own internal checks to ensure the offset is within bounds. Adjust selftests that expect the old error. These tests still fail because the verifier identifies the access as out of bounds for the map, so change them to expect an "invalid access to map value pointer" error instead. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-3-emil@etsalapatis.com --- kernel/bpf/verifier.c | 5 ----- tools/testing/selftests/bpf/verifier/direct_value_access.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a31c032b2dd6..d6b8a77fbe3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,11 +21132,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index c0648dc009b5..e569d119fb60 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -81,7 +81,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 4294967295 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=4294967295", }, { "direct map access, write test 8", @@ -141,7 +141,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 536870912 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=536870912", }, { "direct map access, write test 13", -- cgit v1.2.3 From 0aa721437e4b74d737f58582f1bbf2eea3e038c7 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:23 -0500 Subject: libbpf: Turn relo_core->sym_off unsigned The symbols' relocation offsets in BPF are stored in an int field, but cannot actually be negative. When in the next patch libbpf relocates globals to the end of the arena, it is also possible to have valid offsets > 2GiB that are used to calculate the final relo offsets. Avoid accidentally interpreting large offsets as negative by turning the sym_off field unsigned. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-4-emil@etsalapatis.com --- tools/lib/bpf/libbpf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c7c79014d46c..4d4badb64824 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -380,7 +380,7 @@ struct reloc_desc { const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ struct { int map_idx; - int sym_off; + unsigned int sym_off; /* * The following two fields can be unionized, as the * ext_idx field is used for extern symbols, and the @@ -763,7 +763,7 @@ struct bpf_object { struct { struct bpf_program *prog; - int sym_off; + unsigned int sym_off; int fd; } *jumptable_maps; size_t jumptable_map_cnt; @@ -6192,7 +6192,7 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; } -static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off) +static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off) { size_t i; @@ -6210,7 +6210,7 @@ static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym return -ENOENT; } -static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd) +static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off, int map_fd) { size_t cnt = obj->jumptable_map_cnt; size_t size = sizeof(obj->jumptable_maps[0]); @@ -6244,7 +6244,7 @@ static int find_subprog_idx(struct bpf_program *prog, int insn_idx) static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo) { const __u32 jt_entry_size = 8; - int sym_off = relo->sym_off; + unsigned int sym_off = relo->sym_off; int jt_size = relo->sym_size; __u32 max_entries = jt_size / jt_entry_size; __u32 value_size = sizeof(struct bpf_insn_array_value); @@ -6260,7 +6260,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc return map_fd; if (sym_off % jt_entry_size) { - pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n", + pr_warn("map '.jumptables': jumptable start %u should be multiple of %u\n", sym_off, jt_entry_size); return -EINVAL; } @@ -6316,7 +6316,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc * should contain values that fit in u32. */ if (insn_off > UINT32_MAX) { - pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n", + pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %u\n", (long long)jt[i], sym_off + i * jt_entry_size); err = -EINVAL; goto err_close; -- cgit v1.2.3 From c1f61171d44b19834cf24def2cf832f2688e83df Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:24 -0500 Subject: libbpf: Move arena globals to the end of the arena Arena globals are currently placed at the beginning of the arena by libbpf. This is convenient, but prevents users from reserving guard pages in the beginning of the arena to identify NULL pointer dereferences. Adjust the load logic to place the globals at the end of the arena instead. Also modify bpftool to set the arena pointer in the program's BPF skeleton to point to the globals. Users now call bpf_map__initial_value() to find the beginning of the arena mapping and use the arena pointer in the skeleton to determine which part of the mapping holds the arena globals and which part is free. Suggested-by: Andrii Nakryiko Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com --- tools/lib/bpf/libbpf.c | 17 +++++++++++++---- .../testing/selftests/bpf/progs/verifier_arena_large.c | 12 +++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4d4badb64824..6fba879492a8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -757,6 +757,7 @@ struct bpf_object { int arena_map_idx; void *arena_data; size_t arena_data_sz; + size_t arena_data_off; void *jumptables_data; size_t jumptables_data_sz; @@ -2991,10 +2992,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, void *data, size_t data_sz) { const long page_sz = sysconf(_SC_PAGE_SIZE); + const size_t data_alloc_sz = roundup(data_sz, page_sz); size_t mmap_sz; mmap_sz = bpf_map_mmap_sz(map); - if (roundup(data_sz, page_sz) > mmap_sz) { + if (data_alloc_sz > mmap_sz) { pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", sec_name, mmap_sz, data_sz); return -E2BIG; @@ -3006,6 +3008,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, memcpy(obj->arena_data, data, data_sz); obj->arena_data_sz = data_sz; + /* place globals at the end of the arena */ + obj->arena_data_off = mmap_sz - data_alloc_sz; + /* make bpf_map__init_value() work for ARENA maps */ map->mmaped = obj->arena_data; @@ -4663,7 +4668,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = obj->arena_map_idx; - reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_off = sym->st_value + obj->arena_data_off; map = &obj->maps[obj->arena_map_idx]; pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n", @@ -5624,7 +5629,8 @@ retry: return err; } if (obj->arena_data) { - memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + memcpy(map->mmaped + obj->arena_data_off, obj->arena_data, + obj->arena_data_sz); zfree(&obj->arena_data); } } @@ -14429,7 +14435,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) if (!map_skel->mmaped) continue; - *map_skel->mmaped = map->mmaped; + if (map->def.type == BPF_MAP_TYPE_ARENA) + *map_skel->mmaped = map->mmaped + map->obj->arena_data_off; + else + *map_skel->mmaped = map->mmaped; } return 0; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index bd430a34c3ab..2b8cf2a4d880 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -31,16 +31,22 @@ int big_alloc1(void *ctx) if (!page1) return 1; - /* Account for global arena data. */ - if ((u64)page1 != base + PAGE_SIZE) + if ((u64)page1 != base) return 15; *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - 2 * PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; + + /* Test for the guard region at the end of the arena. */ + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE - PAGE_SIZE, + 1, NUMA_NO_NODE, 0); + if (no_page) + return 16; + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) -- cgit v1.2.3 From 19f12431b6c339416e656c794a26ff0ebb2dba56 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:25 -0500 Subject: selftests/bpf: Add tests for the arena offset of globals Add tests for the new libbpf globals arena offset logic. The tests cover the case of globals being as large as the arena itself, and being smaller than the arena. In that case, the data is placed at the end of the arena, and the beginning of the arena is free. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-6-emil@etsalapatis.com --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 + .../selftests/bpf/progs/verifier_arena_globals1.c | 87 ++++++++++++++++++++++ .../selftests/bpf/progs/verifier_arena_globals2.c | 49 ++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals1.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals2.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 4b4b081b46cc..5829ffd70f8f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -6,6 +6,8 @@ #include "verifier_and.skel.h" #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" +#include "verifier_arena_globals1.skel.h" +#include "verifier_arena_globals2.skel.h" #include "verifier_array_access.skel.h" #include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" @@ -147,6 +149,8 @@ static void run_tests_aux(const char *skel_name, void test_verifier_and(void) { RUN(verifier_and); } void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_arena_large(void) { RUN(verifier_arena_large); } +void test_verifier_arena_globals1(void) { RUN(verifier_arena_globals1); } +void test_verifier_arena_globals2(void) { RUN(verifier_arena_globals2); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c new file mode 100644 index 000000000000..14afef3d6442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_experimental.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +#define ARENA_PAGES (1UL<< (32 - 12)) +#define GLOBAL_PAGES (16) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Global data, to be placed at the end of the arena. + */ +volatile char __arena global_data[GLOBAL_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve1(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0x5a; + __u8 __arena *guard, *globals; + volatile char __arena *ptr; + int i; + int ret; + + guard = (void __arena *)arena_base(&arena); + globals = (void __arena *)(arena_base(&arena) + (ARENA_PAGES - GLOBAL_PAGES) * PAGE_SIZE); + + /* Reserve the region we've offset the globals by. */ + ret = bpf_arena_reserve_pages(&arena, guard, ARENA_PAGES - GLOBAL_PAGES); + if (ret) + return 1; + + /* Make sure the globals are in the expected offset. */ + ret = bpf_arena_reserve_pages(&arena, globals, 1); + if (!ret) + return 2; + + /* Verify globals are properly mapped in by libbpf. */ + for (i = 0; i < GLOBAL_PAGES; i++) { + ptr = &global_data[i][PAGE_SIZE / 2]; + + *ptr = magic; + if (*ptr != magic) + return i + 3; + } +#endif + return 0; +} + +/* + * Relocation check by reading directly into the global data w/o using symbols. + */ +SEC("syscall") +__success __retval(0) +int check_relocation(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0xfa; + u8 __arena *ptr; + + global_data[GLOBAL_PAGES - 1][PAGE_SIZE / 2] = magic; + ptr = (u8 __arena *)((u64)(ARENA_PAGES * PAGE_SIZE - PAGE_SIZE / 2)); + if (*ptr != magic) + return 1; + +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c new file mode 100644 index 000000000000..e6bd7b61f9f1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +#define ARENA_PAGES (32) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Fill the entire arena with global data. + * The offset into the arena should be 0. + */ +char __arena global_data[ARENA_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve2(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + void __arena *guard; + int ret; + + guard = (void __arena *)arena_base(&arena); + + /* Make sure the data at offset 0 case is properly handled. */ + ret = bpf_arena_reserve_pages(&arena, guard, 1); + if (!ret) + return 1; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d311783bc68b011c77a4ef81321de2c94d7deffc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Dec 2025 19:17:53 -0300 Subject: perf list: Remove unused 'sep' variable It is just being set to the return of strchr() but never used, just ditch it and with it get rid of a warning about it not being const on fedora 44. Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20251211221756.96294-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 5cbca0bacd35..ac7bd0e41aa1 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -648,7 +648,7 @@ int cmd_list(int argc, const char **argv) } for (i = 0; i < argc; ++i) { - char *sep, *s; + char *s; if (strcmp(argv[i], "tracepoint") == 0) { char *old_pmu_glob = default_ps.pmu_glob; @@ -720,7 +720,7 @@ int cmd_list(int argc, const char **argv) else if (strcmp(argv[i], "pfm") == 0) print_libpfm_events(&print_cb, ps); #endif - else if ((sep = strchr(argv[i], ':')) != NULL) { + else if (strchr(argv[i], ':') != NULL) { char *old_pmu_glob = ps->pmu_glob; char *old_event_glob = ps->event_glob; -- cgit v1.2.3 From f6f41aef53761517391b6192fe5b4bc30b2d717a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Dec 2025 19:17:54 -0300 Subject: perf diff: Constify strchr() return variables Newer glibc versions return const char for strchr() when the 's' arg is const, change the return variable to const to match that. Also we don't need to turn that ',' into a '\0', as strtol() will stop in the first invalid char. No need to touch read only memory. First noticed with fedora 44. Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20251211221756.96294-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 53d5ea4a6a4f..59bf1f72d12e 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -178,10 +178,9 @@ static struct header_column { } }; -static int setup_compute_opt_wdiff(char *opt) +static int setup_compute_opt_wdiff(const char *opt) { - char *w1_str = opt; - char *w2_str; + const char *w1_str = opt, *w2_str; int ret = -EINVAL; @@ -192,8 +191,7 @@ static int setup_compute_opt_wdiff(char *opt) if (!w2_str) goto out; - *w2_str++ = 0x0; - if (!*w2_str) + if (!*++w2_str) goto out; compute_wdiff_w1 = strtol(w1_str, NULL, 10); @@ -214,7 +212,7 @@ static int setup_compute_opt_wdiff(char *opt) return ret; } -static int setup_compute_opt(char *opt) +static int setup_compute_opt(const char *opt) { if (compute == COMPUTE_WEIGHTED_DIFF) return setup_compute_opt_wdiff(opt); @@ -234,7 +232,7 @@ static int setup_compute(const struct option *opt, const char *str, char *cstr = (char *) str; char buf[50]; unsigned i; - char *option; + const char *option; if (!str) { *cp = COMPUTE_DELTA; -- cgit v1.2.3 From 45718bce7daf39c618188b70a52644bb5a2f968a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Dec 2025 19:17:55 -0300 Subject: perf tools: Use const for variables receiving str{str,r?chr}() returns Newer glibc versions return const char for str{str,chr}() where the haystack/s is const so to avoid warnings like these on fedora 44 change some variables to const: 36 8.17 fedora:44 : FAIL gcc version 15.2.1 20251111 (Red Hat 15.2.1-4) (GCC) libbpf.c: In function 'kallsyms_cb': libbpf.c:8489:13: error: assignment discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 8489 | res = strstr(sym_name, ".llvm."); Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20251211221756.96294-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/jvmti/libjvmti.c | 2 +- tools/perf/tests/parse-events.c | 4 ++-- tools/perf/util/evlist.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c index 82514e6532b8..87bfd4781003 100644 --- a/tools/perf/jvmti/libjvmti.c +++ b/tools/perf/jvmti/libjvmti.c @@ -142,7 +142,7 @@ copy_class_filename(const char * class_sign, const char * file_name, char * resu */ if (*class_sign == 'L') { size_t j, i = 0; - char *p = strrchr(class_sign, '/'); + const char *p = strrchr(class_sign, '/'); if (p) { /* drop the 'L' prefix and copy up to the final '/' */ for (i = 0; i < (size_t)(p - class_sign); i++) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 128d21dc389f..2bd622972114 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2609,8 +2609,8 @@ static int test_events(const struct evlist_test *events, int cnt) for (int i = 0; i < cnt; i++) { struct evlist_test e = events[i]; int test_ret; - const char *pos = e.name; - char buf[1024], *buf_pos = buf, *end; + const char *pos = e.name, *end; + char buf[1024], *buf_pos = buf; while ((end = strstr(pos, "default_core"))) { size_t len = end - pos; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 03674d2cbd01..649519628541 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1945,7 +1945,8 @@ out_free: int evlist__parse_control(const char *str, int *ctl_fd, int *ctl_fd_ack, bool *ctl_fd_close) { - char *comma = NULL, *endptr = NULL; + const char *comma = NULL; + char *endptr = NULL; *ctl_fd_close = false; -- cgit v1.2.3 From c85eff00cf296c146a2b189166eaf85188cd1487 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Dec 2025 19:17:56 -0300 Subject: perf trace: Don't change const char strings We got away with this so far but now with fedora 44 complaining about the return value of strchr et all, lets use strdup for good measure. Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20251211221756.96294-5-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index baee1f695600..d49c1ae409d7 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -5173,8 +5173,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int unset __maybe_unused) { struct trace *trace = (struct trace *)opt->value; - const char *s = str; - char *sep = NULL, *lists[2] = { NULL, NULL, }; + const char *s; + char *strd, *sep = NULL, *lists[2] = { NULL, NULL, }; int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; @@ -5183,6 +5183,10 @@ static int trace__parse_events_option(const struct option *opt, const char *str, if (strace_groups_dir == NULL) return -1; + s = strd = strdup(str); + if (strd == NULL) + return -1; + if (*s == '!') { ++s; trace->not_ev_qualifier = true; @@ -5257,8 +5261,7 @@ out: free(strace_groups_dir); free(lists[0]); free(lists[1]); - if (sep) - *sep = ','; + free(strd); return err; } -- cgit v1.2.3 From cddfb3611275697b88031c473d656dc27da34480 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 31 Oct 2025 09:26:37 -0700 Subject: perf vendor power9 nest metrics: Correct unit from MB to MiB 6.1e-5 is very close to 1/16384, where 16384 is 2^14, i.e. a power of 2. When units are in powers of 2 the IEC unit is MiB (mebibytes) rather than MB (megabytes) where the values are powers of 10. This patch corrects the unit for uniformity and because such units may be pattern matched against. Reviewed-by: Madhavan Srinivasan Signed-off-by: Ian Rogers Cc: Athira Rajeev Cc: Kajol Jain Link: https://lore.kernel.org/r/20251031162637.1456191-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json index 7a5d1bf543f8..8d028a7c2777 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json @@ -29,25 +29,25 @@ "MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT23@", "MetricName" : "mcs01-read", "MetricGroup" : "memory-bandwidth", - "ScaleUnit": "6.1e-5MB" + "ScaleUnit": "6.1e-5MiB" }, { "MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT23@", "MetricName" : "mcs23-read", "MetricGroup" : "memory-bandwidth", - "ScaleUnit": "6.1e-5MB" + "ScaleUnit": "6.1e-5MiB" }, { "MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT23@", "MetricName" : "mcs01-write", "MetricGroup" : "memory-bandwidth", - "ScaleUnit": "6.1e-5MB" + "ScaleUnit": "6.1e-5MiB" }, { "MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT23@", "MetricName" : "mcs23-write", "MetricGroup" : "memory-bandwidth", - "ScaleUnit": "6.1e-5MB" + "ScaleUnit": "6.1e-5MiB" }, { "MetricExpr" : "nest_powerbus0_imc@PM_PB_CYC@", -- cgit v1.2.3 From bdd051e249141c793dec28544e7f5d5bc7690bf3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Dec 2025 18:33:24 -0800 Subject: perf record: Split --data-mmap option Currently -d/--data option controls both PERF_SAMPLE_ADDR bit and perf_event_attr.mmap_data flag. Separate them using new --data-mmap option to support recording only one of them. For data-type profiling, data MMAP is unnecessary but it wastes a lot of space in the ring buffer and data file. Committer testing: On an idle system: root@x1:~# perf record -d -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 5.672 MB perf.data (1075 samples) ] root@x1:~# ls -la perf.data -rw-------. 1 root root 5982480 Dec 16 15:34 perf.data root@x1:~# perf evlist -v cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1 root@x1:~# Now with just --data-mmap we will not save that much, as only DATA_SRC will not be enabled in sample_type: root@x1:~# perf record --data-mmap -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 5.576 MB perf.data (716 samples) ] root@x1:~# ls -la perf.data -rw-------. 1 root root 5880112 Dec 16 15:37 perf.data root@x1:~# perf evlist -v cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1 root@x1:~# To complete, just with DATA_SRC, no mmap_data: root@x1:~# perf record --sample-mem-info -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.407 MB perf.data (1311 samples) ] root@x1:~# ls -la perf.data -rw-------. 1 root root 1509224 Dec 16 15:40 perf.data root@x1:~# perf evlist -v cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER|DATA_SRC, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1 root@x1:~# Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 8 +++++++- tools/perf/builtin-record.c | 19 +++++++++++++------ tools/perf/util/evsel.c | 5 +++-- tools/perf/util/record.h | 2 ++ 4 files changed, 25 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index e8b9aadbbfa5..c402e74172f6 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -344,7 +344,8 @@ OPTIONS -d:: --data:: - Record the sample virtual addresses. Implies --sample-mem-info. + Record the sample virtual addresses. Implies --sample-mem-info and + --data-mmap. --phys-data:: Record the sample physical addresses. @@ -861,6 +862,11 @@ filtered through the mask provided by -C option. Prepare BPF filter to be used by regular users. The action should be either "pin" or "unpin". The filter can be used after it's pinned. +--data-mmap:: + Enable recording MMAP events for non-executable mappings. Basically + perf only records executable mappings but data mmaping can be useful + when you analyze data access with sample addresses. So using -d option + would enable this unless you specify --no-data-mmap manually. include::intel-hybrid.txt[] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2584d0d8bc82..cbfbd9bb1063 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1881,7 +1881,7 @@ static int record__synthesize_workload(struct record *rec, bool tail) process_synthesized_event, &rec->session->machines.host, needs_mmap, - rec->opts.sample_address); + rec->opts.record_data_mmap); perf_thread_map__put(thread_map); return err; } @@ -2191,7 +2191,7 @@ static int record__synthesize(struct record *rec, bool tail) err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads, - f, needs_mmap, opts->sample_address, + f, needs_mmap, opts->record_data_mmap, rec->opts.nr_threads_synthesize); } @@ -3006,8 +3006,9 @@ int record_opts__parse_callchain(struct record_opts *record, ret = parse_callchain_record_opt(arg, callchain); if (!ret) { /* Enable data address sampling for DWARF unwind. */ - if (callchain->record_mode == CALLCHAIN_DWARF) - record->sample_address = true; + if (callchain->record_mode == CALLCHAIN_DWARF && + !record->record_data_mmap_set) + record->record_data_mmap = true; callchain_debug(callchain); } @@ -3686,6 +3687,9 @@ static struct option __record_options[] = { OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms", "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)", record__parse_off_cpu_thresh), + OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap, + &record.opts.record_data_mmap_set, + "Record mmap events for non-executable mappings"), OPT_END() }; @@ -4249,9 +4253,12 @@ int cmd_record(int argc, const char **argv) goto out_opts; } - /* For backward compatibility, -d implies --mem-info */ - if (rec->opts.sample_address) + /* For backward compatibility, -d implies --mem-info and --data-mmap */ + if (rec->opts.sample_address) { rec->opts.sample_data_src = true; + if (!rec->opts.record_data_mmap_set) + rec->opts.record_data_mmap = true; + } /* * Allow aliases to facilitate the lookup of symbols for address diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9cd706f62793..ec6552a6f667 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1445,10 +1445,11 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, attr->inherit_stat = 1; } - if (opts->sample_address) { + if (opts->sample_address) evsel__set_sample_bit(evsel, ADDR); + + if (opts->record_data_mmap) attr->mmap_data = track; - } /* * We don't allow user space callchains for function trace diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h index ea3a6c4657ee..93627c9a7338 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -40,6 +40,8 @@ struct record_opts { bool record_cgroup; bool record_switch_events; bool record_switch_events_set; + bool record_data_mmap; + bool record_data_mmap_set; bool all_kernel; bool all_user; bool kernel_callchains; -- cgit v1.2.3 From 9cdc9738d169f82bd4abb638b2ac8690bdee5522 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Dec 2025 18:33:25 -0800 Subject: perf report: Enable data-type profiling with -F option too It checked -s/--sort options only. As the sort keys can be setup using the -F/--fields option as well, it should enable data-type profiling with it too. The following two commands should have the same output. $ perf report -s type $ perf report -F overhead,type But there's another problem on this. I'll handle it in the next commit. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index add6b1c2aaf0..6c2b4f93ec78 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1727,7 +1727,8 @@ repeat: sort_order = NULL; } - if (sort_order && strstr(sort_order, "type")) { + if ((sort_order && strstr(sort_order, "type")) || + (field_order && strstr(field_order, "type"))) { report.data_type = true; annotate_opts.annotate_src = false; -- cgit v1.2.3 From 5d35d829bb0b19ee51be9732e3b5f81abc7ef3bb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Dec 2025 18:33:26 -0800 Subject: perf report: Fix histogram entry collapsing for -F option Users can use -F/--fields option to set output fields and sort keys together. But it missed to set perf_hpp_list->need_collapse for sort entries that have se_collapse callbacks. So it ends up with having duplicated entries separately. For example, let's run this command first. $ perf mem record -t load -U -- perf test -w datasym This will record samples for memory access (load) to struct 'buf' and a loop condition ('sig_atomic_t') types. So the following two commands should have identical output. $ perf report -s type --stdio --percent-limit=1 -q 87.80% perf buf 12.17% perf sig_atomic_t But using -F option didn't collapse the entries based on types so the result looked like below: $ perf report -F overhead,type --stdio --percent-limit=1 -q 23.31% perf buf 22.84% perf buf 21.26% perf buf 20.39% perf buf 12.17% perf sig_atomic_t Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f3a565b0e230..3d4b68fd6e44 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3585,6 +3585,9 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list, if (__sort_dimension__add_hpp_output(sd, list, level) < 0) return -1; + if (sd->entry->se_collapse) + list->need_collapse = 1; + sd->taken = 1; return 0; } -- cgit v1.2.3 From cbd41c6d4c26c161a2b0e70ad411d3885ff13507 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Dec 2025 18:33:27 -0800 Subject: perf report: Update sort key state from -F option Factor out __sort_dimension__update() so that it can be called from -s and -F option parsing logics. Otherwise the following command cannot go into the annotation mode. $ perf report -F overhead,type,sym Warning: Annotation is only available for symbolic views, include "sym*" in --sort to use it. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 100 ++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 3d4b68fd6e44..f963d61ac166 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3538,6 +3538,56 @@ out: return ret; } +static int __sort_dimension__update(struct sort_dimension *sd, + struct perf_hpp_list *list) +{ + if (sd->entry == &sort_parent && parent_pattern) { + int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); + if (ret) { + char err[BUFSIZ]; + + regerror(ret, &parent_regex, err, sizeof(err)); + pr_err("Invalid regex: %s\n%s", parent_pattern, err); + return -EINVAL; + } + list->parent = 1; + } else if (sd->entry == &sort_sym) { + list->sym = 1; + /* + * perf diff displays the performance difference amongst + * two or more perf.data files. Those files could come + * from different binaries. So we should not compare + * their ips, but the name of symbol. + */ + if (sort__mode == SORT_MODE__DIFF) + sd->entry->se_collapse = sort__sym_sort; + + } else if (sd->entry == &sort_sym_offset) { + list->sym = 1; + } else if (sd->entry == &sort_dso) { + list->dso = 1; + } else if (sd->entry == &sort_socket) { + list->socket = 1; + } else if (sd->entry == &sort_thread) { + list->thread = 1; + } else if (sd->entry == &sort_comm) { + list->comm = 1; + } else if (sd->entry == &sort_type_offset) { + symbol_conf.annotate_data_member = true; + } else if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) { + list->sym = 1; + } else if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0) { + return -EINVAL; + } else if (sd->entry == &sort_mem_daddr_sym) { + list->sym = 1; + } + + if (sd->entry->se_collapse) + list->need_collapse = 1; + + return 0; +} + static int __sort_dimension__add(struct sort_dimension *sd, struct perf_hpp_list *list, int level) @@ -3548,8 +3598,8 @@ static int __sort_dimension__add(struct sort_dimension *sd, if (__sort_dimension__add_hpp_sort(sd, list, level) < 0) return -1; - if (sd->entry->se_collapse) - list->need_collapse = 1; + if (__sort_dimension__update(sd, list) < 0) + return -1; sd->taken = 1; @@ -3585,8 +3635,8 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list, if (__sort_dimension__add_hpp_output(sd, list, level) < 0) return -1; - if (sd->entry->se_collapse) - list->need_collapse = 1; + if (__sort_dimension__update(sd, list) < 0) + return -1; sd->taken = 1; return 0; @@ -3651,39 +3701,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, sort_dimension_add_dynamic_header(sd, env); } - if (sd->entry == &sort_parent && parent_pattern) { - int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); - if (ret) { - char err[BUFSIZ]; - - regerror(ret, &parent_regex, err, sizeof(err)); - pr_err("Invalid regex: %s\n%s", parent_pattern, err); - return -EINVAL; - } - list->parent = 1; - } else if (sd->entry == &sort_sym) { - list->sym = 1; - /* - * perf diff displays the performance difference amongst - * two or more perf.data files. Those files could come - * from different binaries. So we should not compare - * their ips, but the name of symbol. - */ - if (sort__mode == SORT_MODE__DIFF) - sd->entry->se_collapse = sort__sym_sort; - - } else if (sd->entry == &sort_dso) { - list->dso = 1; - } else if (sd->entry == &sort_socket) { - list->socket = 1; - } else if (sd->entry == &sort_thread) { - list->thread = 1; - } else if (sd->entry == &sort_comm) { - list->comm = 1; - } else if (sd->entry == &sort_type_offset) { - symbol_conf.annotate_data_member = true; - } - return __sort_dimension__add(sd, list, level); } @@ -3702,9 +3719,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, strlen(tok))) return -EINVAL; - if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) - list->sym = 1; - __sort_dimension__add(sd, list, level); return 0; } @@ -3718,12 +3732,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, if (sort__mode != SORT_MODE__MEMORY) return -EINVAL; - if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0) - return -EINVAL; - - if (sd->entry == &sort_mem_daddr_sym) - list->sym = 1; - __sort_dimension__add(sd, list, level); return 0; } -- cgit v1.2.3 From a05385d84b2af64600fc84b027bea481e8f6261d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:44 -0800 Subject: perf/x86/core: Register a new vector for handling mediated guest PMIs Wire up system vector 0xf5 for handling PMIs (i.e. interrupts delivered through the LVTPC) while running KVM guests with a mediated PMU. Perf currently delivers all PMIs as NMIs, e.g. so that events that trigger while IRQs are disabled aren't delayed and generate useless records, but due to the multiplexing of NMIs throughout the system, correctly identifying NMIs for a mediated PMU is practically infeasible. To (greatly) simplify identifying guest mediated PMU PMIs, perf will switch the CPU's LVTPC between PERF_GUEST_MEDIATED_PMI_VECTOR and NMI when guest PMU context is loaded/put. I.e. PMIs that are generated by the CPU while the guest is active will be identified purely based on the IRQ vector. Route the vector through perf, e.g. as opposed to letting KVM attach a handler directly a la posted interrupt notification vectors, as perf owns the LVTPC and thus is the rightful owner of PERF_GUEST_MEDIATED_PMI_VECTOR. Functionally, having KVM directly own the vector would be fine (both KVM and perf will be completely aware of when a mediated PMU is active), but would lead to an undesirable split in ownership: perf would be responsible for installing the vector, but not handling the resulting IRQs. Add a new perf_guest_info_callbacks hook (and static call) to allow KVM to register its handler with perf when running guests with mediated PMUs. Note, because KVM always runs guests with host IRQs enabled, there is no danger of a PMI being delayed from the guest's perspective due to using a regular IRQ instead of an NMI. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-9-seanjc@google.com --- arch/x86/entry/entry_fred.c | 1 + arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/idtentry.h | 6 ++++++ arch/x86/include/asm/irq_vectors.h | 4 +++- arch/x86/kernel/idt.c | 3 +++ arch/x86/kernel/irq.c | 19 +++++++++++++++++++ include/linux/perf_event.h | 8 ++++++++ kernel/events/core.c | 9 +++++++-- .../trace/beauty/arch/x86/include/asm/irq_vectors.h | 3 ++- virt/kvm/kvm_main.c | 3 +++ 10 files changed, 55 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 94e626cc6a07..a9b72997103d 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -114,6 +114,7 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(IRQ_WORK_VECTOR, irq_work), + SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6b6d472baa0b..9314642ae93c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,9 @@ typedef struct { unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; +#endif +#ifdef CONFIG_GUEST_PERF_EVENTS + unsigned int perf_guest_mediated_pmis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3218770670d3..42bf6a58ec36 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -746,6 +746,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_GUEST_PERF_EVENTS +DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); +#else +# define fred_sysvec_perf_guest_mediated_pmi_handler NULL +#endif + # ifdef CONFIG_X86_POSTED_MSI DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 47051871b436..85253fc8e384 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,9 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +/* IRQ vector for PMIs when running a guest with a mediated PMU. */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f445bec516a0..260456588756 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -158,6 +158,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif +#ifdef CONFIG_GUEST_PERF_EVENTS + INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), +#endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 86f4e574de02..d56185b49a0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -192,6 +192,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_GUEST_PERF_EVENTS + seq_printf(p, "%*s: ", prec, "VPMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->perf_guest_mediated_pmis); + seq_puts(p, " Perf Guest Mediated PMI\n"); +#endif #ifdef CONFIG_X86_POSTED_MSI seq_printf(p, "%*s: ", prec, "PMN"); for_each_online_cpu(j) @@ -349,6 +356,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +/* + * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) +{ + apic_eoi(); + inc_irq_stat(perf_guest_mediated_pmis); + perf_guest_handle_mediated_pmi(); +} +#endif + #if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 322cfa9f3d48..82e617fad165 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1677,6 +1677,8 @@ struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); + + void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS @@ -1686,6 +1688,7 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { @@ -1702,6 +1705,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) return static_call(__perf_guest_handle_intel_pt_intr)(); } +static inline void perf_guest_handle_mediated_pmi(void) +{ + static_call(__perf_guest_handle_mediated_pmi)(); +} + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index bbb81a4a3196..dd842a4ca789 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7644,6 +7644,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7658,6 +7659,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7669,8 +7674,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index 47051871b436..6e1d5b955aae 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,8 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5fcd401a5897..21a0d226d63f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6467,11 +6467,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, + .handle_mediated_pmi = NULL, }; void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + kvm_guest_cbs.handle_mediated_pmi = NULL; + perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void) -- cgit v1.2.3 From c1c7d61746f42154bad30bc4adb88f5374969408 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:14 -0800 Subject: resolve_btfids: Rename object btf field to btf_path Rename the member of `struct object` holding the path to BTF data if provided via --btf arg. `btf_path` is less ambiguous. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-2-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d47191c6e55e..164f0c941f04 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -113,7 +113,7 @@ struct btf_id { struct object { const char *path; - const char *btf; + const char *btf_path; const char *base_btf_path; struct { @@ -550,11 +550,11 @@ static int symbols_resolve(struct object *obj) } } - btf = btf__parse_split(obj->btf ?: obj->path, base_btf); + btf = btf__parse_split(obj->btf_path ?: obj->path, base_btf); err = libbpf_get_error(btf); if (err) { pr_err("FAILED: load BTF from %s: %s\n", - obj->btf ?: obj->path, strerror(-err)); + obj->btf_path ?: obj->path, strerror(-err)); goto out; } @@ -790,8 +790,8 @@ int main(int argc, const char **argv) struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), - OPT_STRING(0, "btf", &obj.btf, "BTF data", - "BTF data"), + OPT_STRING(0, "btf", &obj.btf_path, "file", + "path to a file with input BTF data"), OPT_STRING('b', "btf_base", &obj.base_btf_path, "file", "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, -- cgit v1.2.3 From 5f347a0f781a36927a547904dde4c4dac343010b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:15 -0800 Subject: resolve_btfids: Factor out load_btf() Increase the lifetime of parsed BTF in resolve_btfids by factoring load_btf() routine out of symbols_resolve() and storing the base_btf and btf pointers in the struct object. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-3-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 47 +++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 164f0c941f04..b4caae1170dd 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -116,6 +116,9 @@ struct object { const char *btf_path; const char *base_btf_path; + struct btf *btf; + struct btf *base_btf; + struct { int fd; Elf *elf; @@ -529,16 +532,10 @@ static int symbols_collect(struct object *obj) return 0; } -static int symbols_resolve(struct object *obj) +static int load_btf(struct object *obj) { - int nr_typedefs = obj->nr_typedefs; - int nr_structs = obj->nr_structs; - int nr_unions = obj->nr_unions; - int nr_funcs = obj->nr_funcs; - struct btf *base_btf = NULL; - int err, type_id; - struct btf *btf; - __u32 nr_types; + struct btf *base_btf = NULL, *btf = NULL; + int err; if (obj->base_btf_path) { base_btf = btf__parse(obj->base_btf_path, NULL); @@ -546,7 +543,7 @@ static int symbols_resolve(struct object *obj) if (err) { pr_err("FAILED: load base BTF from %s: %s\n", obj->base_btf_path, strerror(-err)); - return -1; + goto out_err; } } @@ -555,9 +552,30 @@ static int symbols_resolve(struct object *obj) if (err) { pr_err("FAILED: load BTF from %s: %s\n", obj->btf_path ?: obj->path, strerror(-err)); - goto out; + goto out_err; } + obj->base_btf = base_btf; + obj->btf = btf; + + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + return err; +} + +static int symbols_resolve(struct object *obj) +{ + int nr_typedefs = obj->nr_typedefs; + int nr_structs = obj->nr_structs; + int nr_unions = obj->nr_unions; + int nr_funcs = obj->nr_funcs; + struct btf *btf = obj->btf; + int err, type_id; + __u32 nr_types; + err = -1; nr_types = btf__type_cnt(btf); @@ -615,8 +633,6 @@ static int symbols_resolve(struct object *obj) err = 0; out: - btf__free(base_btf); - btf__free(btf); return err; } @@ -824,6 +840,9 @@ int main(int argc, const char **argv) if (symbols_collect(&obj)) goto out; + if (load_btf(&obj)) + goto out; + if (symbols_resolve(&obj)) goto out; @@ -833,6 +852,8 @@ int main(int argc, const char **argv) if (!(fatal_warnings && warnings)) err = 0; out: + btf__free(obj.base_btf); + btf__free(obj.btf); if (obj.efile.elf) { elf_end(obj.efile.elf); close(obj.efile.fd); -- cgit v1.2.3 From a4fa885bd52d1711994ad1ef99e989977cd15698 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:16 -0800 Subject: resolve_btfids: Introduce enum btf_id_kind Instead of using multiple flags, make struct btf_id tagged with an enum value indicating its kind in the context of resolve_btfids. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-4-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 83 +++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index b4caae1170dd..e721e20a2bbd 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -98,6 +98,13 @@ # error "Unknown machine endianness!" #endif +enum btf_id_kind { + BTF_ID_KIND_NONE, + BTF_ID_KIND_SYM, + BTF_ID_KIND_SET, + BTF_ID_KIND_SET8 +}; + struct btf_id { struct rb_node rb_node; char *name; @@ -105,9 +112,8 @@ struct btf_id { int id; int cnt; }; + enum btf_id_kind kind; int addr_cnt; - bool is_set; - bool is_set8; Elf64_Addr addr[ADDR_CNT]; }; @@ -197,8 +203,10 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) return NULL; } -static struct btf_id * -btf_id__add(struct rb_root *root, char *name, bool unique) +static struct btf_id *__btf_id__add(struct rb_root *root, + char *name, + enum btf_id_kind kind, + bool unique) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -221,12 +229,23 @@ btf_id__add(struct rb_root *root, char *name, bool unique) if (id) { pr_debug("adding symbol %s\n", name); id->name = name; + id->kind = kind; rb_link_node(&id->rb_node, parent, p); rb_insert_color(&id->rb_node, root); } return id; } +static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, false); +} + +static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, true); +} + static char *get_id(const char *prefix_end) { /* @@ -260,22 +279,36 @@ static char *get_id(const char *prefix_end) return id; } -static struct btf_id *add_set(struct object *obj, char *name, bool is_set8) +static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind) { + int len = strlen(name); + int prefixlen; + char *id; + /* * __BTF_ID__set__name * name = ^ * id = ^ */ - char *id = name + (is_set8 ? sizeof(BTF_SET8 "__") : sizeof(BTF_SET "__")) - 1; - int len = strlen(name); + switch (kind) { + case BTF_ID_KIND_SET: + prefixlen = sizeof(BTF_SET "__") - 1; + break; + case BTF_ID_KIND_SET8: + prefixlen = sizeof(BTF_SET8 "__") - 1; + break; + default: + pr_err("Unexpected kind %d passed to %s() for symbol %s\n", kind, __func__, name); + return NULL; + } + id = name + prefixlen; if (id >= name + len) { pr_err("FAILED to parse set name: %s\n", name); return NULL; } - return btf_id__add(&obj->sets, id, true); + return btf_id__add_unique(&obj->sets, id, kind); } static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) @@ -288,7 +321,7 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return NULL; } - return btf_id__add(root, id, false); + return btf_id__add(root, id, BTF_ID_KIND_SYM); } /* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ @@ -491,35 +524,31 @@ static int symbols_collect(struct object *obj) id = add_symbol(&obj->funcs, prefix, sizeof(BTF_FUNC) - 1); /* set8 */ } else if (!strncmp(prefix, BTF_SET8, sizeof(BTF_SET8) - 1)) { - id = add_set(obj, prefix, true); + id = add_set(obj, prefix, BTF_ID_KIND_SET8); /* * SET8 objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(uint64_t) - 1; - id->is_set8 = true; - } /* set */ } else if (!strncmp(prefix, BTF_SET, sizeof(BTF_SET) - 1)) { - id = add_set(obj, prefix, false); + id = add_set(obj, prefix, BTF_ID_KIND_SET); /* * SET objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(int) - 1; - id->is_set = true; - } } else { pr_err("FAILED unsupported prefix %s\n", prefix); return -1; } if (!id) - return -ENOMEM; + return -EINVAL; if (id->addr_cnt >= ADDR_CNT) { pr_err("FAILED symbol %s crossed the number of allowed lists\n", @@ -643,7 +672,7 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; /* For set, set8, id->id may be 0 */ - if (!id->id && !id->is_set && !id->is_set8) { + if (!id->id && id->kind != BTF_ID_KIND_SET && id->kind != BTF_ID_KIND_SET8) { pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); warnings++; } @@ -696,6 +725,7 @@ static int sets_patch(struct object *obj) { Elf_Data *data = obj->efile.idlist; struct rb_node *next; + int cnt; next = rb_first(&obj->sets); while (next) { @@ -715,11 +745,15 @@ static int sets_patch(struct object *obj) return -1; } - if (id->is_set) { + switch (id->kind) { + case BTF_ID_KIND_SET: set = data->d_buf + off; + cnt = set->cnt; qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); - } else { + break; + case BTF_ID_KIND_SET8: set8 = data->d_buf + off; + cnt = set8->cnt; /* * Make sure id is at the beginning of the pairs * struct, otherwise the below qsort would not work. @@ -744,10 +778,13 @@ static int sets_patch(struct object *obj) bswap_32(set8->pairs[i].flags); } } + break; + default: + pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); + return -1; } - pr_debug("sorting addr %5lu: cnt %6d [%s]\n", - off, id->is_set ? set->cnt : set8->cnt, id->name); + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", off, cnt, id->name); next = rb_next(next); } -- cgit v1.2.3 From fb348d4fdf5ec08aea8b1f686136584f758e4364 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:17 -0800 Subject: resolve_btfids: Always build with -Wall -Werror resolve_btfids builds without compiler warnings currently, so let's enforce this for future changes with '-Wall -Werror' flags [1]. [1] https://lore.kernel.org/bpf/1957a60b-6c45-42a7-b525-a6e335a735ff@linux.dev/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20251219181321.1283664-5-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index ce1b556dfa90..1733a6e93a07 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -70,7 +70,8 @@ HOSTCFLAGS_resolve_btfids += -g \ -I$(srctree)/tools/include/uapi \ -I$(LIBBPF_INCLUDE) \ -I$(SUBCMD_INCLUDE) \ - $(LIBELF_FLAGS) + $(LIBELF_FLAGS) \ + -Wall -Werror LIBS = $(LIBELF_LIBS) -lz -- cgit v1.2.3 From 903922cfa0e60573234ff895974c23a000035258 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:23 -0800 Subject: lib/Kconfig.debug: Set the minimum required pahole version to v1.22 Subsequent patches in the series change vmlinux linking scripts to unconditionally pass --btf_encode_detached to pahole, which was introduced in v1.22 [1][2]. This change allows to remove PAHOLE_HAS_SPLIT_BTF Kconfig option and other checks of older pahole versions. [1] https://github.com/acmel/dwarves/releases/tag/v1.22 [2] https://lore.kernel.org/bpf/cbafbf4e-9073-4383-8ee6-1353f9e5869c@oracle.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Acked-by: Nicolas Schier Link: https://lore.kernel.org/bpf/20251219181825.1289460-1-ihor.solodrai@linux.dev --- Documentation/process/changes.rst | 4 ++-- Documentation/scheduler/sched-ext.rst | 1 - lib/Kconfig.debug | 13 ++++--------- scripts/Makefile.btf | 9 +-------- tools/sched_ext/README.md | 1 - 5 files changed, 7 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 62951cdb13ad..b7e329159d00 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -38,7 +38,7 @@ bash 4.2 bash --version binutils 2.30 ld -v flex 2.5.35 flex --version bison 2.0 bison --version -pahole 1.16 pahole --version +pahole 1.22 pahole --version util-linux 2.10o mount --version kmod 13 depmod -V e2fsprogs 1.41.4 e2fsck -V @@ -143,7 +143,7 @@ pahole Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel -modules as well. This requires pahole v1.16 or later. +modules as well. This requires pahole v1.22 or later. It is found in the 'dwarves' or 'pahole' distro packages or from https://fedorapeople.org/~acme/dwarves/. diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 404fe6126a76..9e2882d937b4 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -43,7 +43,6 @@ options should be enabled to use sched_ext: CONFIG_DEBUG_INFO_BTF=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y - CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_BTF_TAG=y sched_ext is used only when the BPF scheduler is loaded and running. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ba36939fda79..60281c4f9e99 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -388,18 +388,13 @@ config DEBUG_INFO_BTF depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL - depends on PAHOLE_VERSION >= 116 - depends on DEBUG_INFO_DWARF4 || PAHOLE_VERSION >= 121 + depends on PAHOLE_VERSION >= 122 # pahole uses elfutils, which does not have support for Hexagon relocations depends on !HEXAGON help Generate deduplicated BTF type information from DWARF debug info. - Turning this on requires pahole v1.16 or later (v1.21 or later to - support DWARF 5), which will convert DWARF type info into equivalent - deduplicated BTF type info. - -config PAHOLE_HAS_SPLIT_BTF - def_bool PAHOLE_VERSION >= 119 + Turning this on requires pahole v1.22 or later, which will convert + DWARF type info into equivalent deduplicated BTF type info. config PAHOLE_HAS_BTF_TAG def_bool PAHOLE_VERSION >= 123 @@ -421,7 +416,7 @@ config PAHOLE_HAS_LANG_EXCLUDE config DEBUG_INFO_BTF_MODULES bool "Generate BTF type information for kernel modules" default y - depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF + depends on DEBUG_INFO_BTF && MODULES help Generate compact split BTF type information for kernel modules. diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index db76335dd917..840a55de42da 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -7,14 +7,7 @@ JOBS := $(patsubst -j%,%,$(filter -j%,$(MAKEFLAGS))) ifeq ($(call test-le, $(pahole-ver), 125),y) -# pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars -ifeq ($(call test-le, $(pahole-ver), 121),y) -pahole-flags-$(call test-ge, $(pahole-ver), 118) += --skip_encoding_btf_vars -endif - -pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats - -pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j$(JOBS) +pahole-flags-y += --btf_gen_floats -j$(JOBS) pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 16a42e4060f6..56a9d1557ac4 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -65,7 +65,6 @@ It's also recommended that you also include the following Kconfig options: ``` CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_BTF_TAG=y ``` -- cgit v1.2.3 From 014e1cdb5fad8c6034feb3a97468a91edf23d3d0 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:24 -0800 Subject: selftests/bpf: Run resolve_btfids only for relevant .test.o objects A selftest targeting resolve_btfids functionality relies on a resolved .BTF_ids section to be available in the TRUNNER_BINARY. The underlying BTF data is taken from a special BPF program (btf_data.c), and so resolve_btfids is executed as a part of a TRUNNER_BINARY build recipe on the final binary. Subsequent patches in this series allow resolve_btfids to modify BTF before resolving the symbols, which means that the test needs access to that modified BTF [1]. Currently the test simply reads in btf_data.bpf.o on the assumption that BTF hasn't changed. Implement resolve_btfids call only for particular test objects (just resolve_btfids.test.o for now). The test objects are linked into the TRUNNER_BINARY, and so .BTF_ids section will be available there. This will make it trivial for the resolve_btfids test to access BTF modified by resolve_btfids. [1] https://lore.kernel.org/bpf/CAErzpmvsgSDe-QcWH8SFFErL6y3p3zrqNri5-UHJ9iK2ChyiBw@mail.gmail.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-2-ihor.solodrai@linux.dev --- tools/testing/selftests/bpf/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4aa60e83ff19..ffd0a4c354c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -643,6 +643,9 @@ $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c ) > $$@) endif +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: $(RESOLVE_BTFIDS) $(TRUNNER_OUTPUT)/btf_data.bpf.o +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: private TEST_NEEDS_BTFIDS = 1 + # compile individual test files # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ @@ -650,6 +653,9 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -695,13 +701,11 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ - $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool -- cgit v1.2.3 From 522397d05e7d4a7c30b91841492360336b24f833 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:25 -0800 Subject: resolve_btfids: Change in-place update with raw binary output Currently resolve_btfids updates .BTF_ids section of an ELF file in-place, based on the contents of provided BTF, usually within the same input file, and optionally a BTF base. Change resolve_btfids behavior to enable BTF transformations as part of its main operation. To achieve this, in-place ELF write in resolve_btfids is replaced with generation of the following binaries: * ${1}.BTF with .BTF section data * ${1}.BTF_ids with .BTF_ids section data if it existed in ${1} * ${1}.BTF.base with .BTF.base section data for out-of-tree modules The execution of resolve_btfids and consumption of its output is orchestrated by scripts/gen-btf.sh introduced in this patch. The motivation for emitting binary data is that it allows simplifying resolve_btfids implementation by delegating ELF update to the $OBJCOPY tool [1], which is already widely used across the codebase. There are two distinct paths for BTF generation and resolve_btfids application in the kernel build: for vmlinux and for kernel modules. For the vmlinux binary a .BTF section is added in a roundabout way to ensure correct linking. The patch doesn't change this approach, only the implementation is a little different. Before this patch it worked as follows: * pahole consumed .tmp_vmlinux1 [2] and added .BTF section with llvm-objcopy [3] to it * then everything except the .BTF section was stripped from .tmp_vmlinux1 into a .tmp_vmlinux1.bpf.o object [2], later linked into vmlinux * resolve_btfids was executed later on vmlinux.unstripped [4], updating it in-place After this patch gen-btf.sh implements the following: * pahole consumes .tmp_vmlinux1 and produces a *detached* file with raw BTF data * resolve_btfids consumes .tmp_vmlinux1 and detached BTF to produce (potentially modified) .BTF, and .BTF_ids sections data * a .tmp_vmlinux1.bpf.o object is then produced with objcopy copying BTF output of resolve_btfids * .BTF_ids data gets embedded into vmlinux.unstripped in link-vmlinux.sh by objcopy --update-section For kernel modules, creating a special .bpf.o file is not necessary, and so embedding of sections data produced by resolve_btfids is straightforward with objcopy. With this patch an ELF file becomes effectively read-only within resolve_btfids, which allows deleting elf_update() call and satellite code (like compressed_section_fix [5]). Endianness handling of .BTF_ids data is also changed. Previously the "flags" part of the section was bswapped in sets_patch() [6], and then Elf_Type was modified before elf_update() to signal to libelf that bswap may be necessary. With this patch we explicitly bswap entire data buffer on load and on dump. [1] https://lore.kernel.org/bpf/131b4190-9c49-4f79-a99d-c00fac97fa44@linux.dev/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n110 [3] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/tree/btf_encoder.c?h=v1.31#n1803 [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n284 [5] https://lore.kernel.org/bpf/20200819092342.259004-1-jolsa@kernel.org/ [6] https://lore.kernel.org/bpf/cover.1707223196.git.vmalik@redhat.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-3-ihor.solodrai@linux.dev --- MAINTAINERS | 1 + scripts/Makefile.btf | 12 +- scripts/Makefile.modfinal | 5 +- scripts/Makefile.vmlinux | 2 +- scripts/gen-btf.sh | 157 +++++++++++++++ scripts/link-vmlinux.sh | 42 +--- tools/bpf/resolve_btfids/main.c | 224 +++++++++++++-------- tools/testing/selftests/bpf/.gitignore | 3 + tools/testing/selftests/bpf/Makefile | 9 +- .../selftests/bpf/prog_tests/resolve_btfids.c | 4 +- 10 files changed, 328 insertions(+), 131 deletions(-) create mode 100755 scripts/gen-btf.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 5b11839cba9d..cb1898a85b05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4766,6 +4766,7 @@ F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ F: scripts/bpf_doc.py +F: scripts/gen-btf.sh F: scripts/Makefile.btf F: scripts/pahole-version.sh F: tools/bpf/ diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index 840a55de42da..562a04b40e06 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -18,13 +18,15 @@ pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j$(JOBS) --btf_features=enc pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes -ifneq ($(KBUILD_EXTMOD),) -module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base -endif - endif pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE) += --lang_exclude=rust export PAHOLE_FLAGS := $(pahole-flags-y) -export MODULE_PAHOLE_FLAGS := $(module-pahole-flags-y) + +resolve-btfids-flags-y := +resolve-btfids-flags-$(CONFIG_WERROR) += --fatal_warnings +resolve-btfids-flags-$(if $(KBUILD_EXTMOD),y) += --distill_base +resolve-btfids-flags-$(if $(KBUILD_VERBOSE),y) += --verbose + +export RESOLVE_BTFIDS_FLAGS := $(resolve-btfids-flags-y) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 149e12ff5700..422c56dc878e 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -42,9 +42,8 @@ quiet_cmd_btf_ko = BTF [M] $@ cmd_btf_ko = \ if [ ! -f $(objtree)/vmlinux ]; then \ printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ - else \ - LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) $(MODULE_PAHOLE_FLAGS) --btf_base $(objtree)/vmlinux $@; \ - $(RESOLVE_BTFIDS) -b $(objtree)/vmlinux $@; \ + else \ + $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index cd788cac9d91..20a988f4fe0c 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -71,7 +71,7 @@ targets += vmlinux.unstripped .vmlinux.export.o vmlinux.unstripped: scripts/link-vmlinux.sh vmlinux.o .vmlinux.export.o $(KBUILD_LDS) FORCE +$(call if_changed_dep,link_vmlinux) ifdef CONFIG_DEBUG_INFO_BTF -vmlinux.unstripped: $(RESOLVE_BTFIDS) +vmlinux.unstripped: $(RESOLVE_BTFIDS) $(srctree)/scripts/gen-btf.sh endif ifdef CONFIG_BUILDTIME_TABLE_SORT diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh new file mode 100755 index 000000000000..06c6d8becaa2 --- /dev/null +++ b/scripts/gen-btf.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2025 Meta Platforms, Inc. and affiliates. +# +# This script generates BTF data for the provided ELF file. +# +# Kernel BTF generation involves these conceptual steps: +# 1. pahole generates BTF from DWARF data +# 2. resolve_btfids applies kernel-specific btf2btf +# transformations and computes data for .BTF_ids section +# 3. the result gets linked/objcopied into the target binary +# +# How step (3) should be done differs between vmlinux, and +# kernel modules, which is the primary reason for the existence +# of this script. +# +# For modules the script expects vmlinux passed in as --btf_base. +# Generated .BTF, .BTF.base and .BTF_ids sections become embedded +# into the input ELF file with objcopy. +# +# For vmlinux the input file remains unchanged and two files are produced: +# - ${1}.btf.o ready for linking into vmlinux +# - ${1}.BTF_ids with .BTF_ids data blob +# This output is consumed by scripts/link-vmlinux.sh + +set -e + +usage() +{ + echo "Usage: $0 [--btf_base ] " + exit 1 +} + +BTF_BASE="" + +while [ $# -gt 0 ]; do + case "$1" in + --btf_base) + BTF_BASE="$2" + shift 2 + ;; + -*) + echo "Unknown option: $1" >&2 + usage + ;; + *) + break + ;; + esac +done + +if [ $# -ne 1 ]; then + usage +fi + +ELF_FILE="$1" +shift + +is_enabled() { + grep -q "^$1=y" ${objtree}/include/config/auto.conf +} + +info() +{ + printf " %-7s %s\n" "${1}" "${2}" +} + +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + + +gen_btf_data() +{ + info BTF "${ELF_FILE}" + btf1="${ELF_FILE}.BTF.1" + ${PAHOLE} -J ${PAHOLE_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf_encode_detached=${btf1} \ + "${ELF_FILE}" + + info BTFIDS "${ELF_FILE}" + ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf ${btf1} "${ELF_FILE}" +} + +gen_btf_o() +{ + local btf_data=${ELF_FILE}.btf.o + + # Create ${btf_data} which contains just .BTF section but no symbols. Add + # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all + # deletes all symbols including __start_BTF and __stop_BTF, which will + # be redefined in the linker script. + info OBJCOPY "${btf_data}" + echo "" | ${CC} ${CLANG_FLAGS} -c -x c -o ${btf_data} - + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ + --set-section-flags .BTF=alloc,readonly ${btf_data} + ${OBJCOPY} --only-section=.BTF --strip-all ${btf_data} + + # Change e_type to ET_REL so that it can be used to link final vmlinux. + # GNU ld 2.35+ and lld do not allow an ET_EXEC input. + if is_enabled CONFIG_CPU_BIG_ENDIAN; then + et_rel='\0\1' + else + et_rel='\1\0' + fi + printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none +} + +embed_btf_data() +{ + info OBJCOPY "${ELF_FILE}.BTF" + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE} + + # a module might not have a .BTF_ids or .BTF.base section + local btf_base="${ELF_FILE}.BTF.base" + if [ -f "${btf_base}" ]; then + ${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE} + fi + local btf_ids="${ELF_FILE}.BTF_ids" + if [ -f "${btf_ids}" ]; then + ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + fi +} + +cleanup() +{ + rm -f "${ELF_FILE}.BTF.1" + rm -f "${ELF_FILE}.BTF" + if [ "${BTFGEN_MODE}" = "module" ]; then + rm -f "${ELF_FILE}.BTF.base" + rm -f "${ELF_FILE}.BTF_ids" + fi +} +trap cleanup EXIT + +BTFGEN_MODE="vmlinux" +if [ -n "${BTF_BASE}" ]; then + BTFGEN_MODE="module" +fi + +gen_btf_data + +case "${BTFGEN_MODE}" in +vmlinux) + gen_btf_o + ;; +module) + embed_btf_data + ;; +esac + +exit 0 diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 4ab44c73da4d..e2207e612ac3 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -106,34 +106,6 @@ vmlinux_link() ${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs} } -# generate .BTF typeinfo from DWARF debuginfo -# ${1} - vmlinux image -gen_btf() -{ - local btf_data=${1}.btf.o - - info BTF "${btf_data}" - LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1} - - # Create ${btf_data} which contains just .BTF section but no symbols. Add - # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all - # deletes all symbols including __start_BTF and __stop_BTF, which will - # be redefined in the linker script. Add 2>/dev/null to suppress GNU - # objcopy warnings: "empty loadable segment detected at ..." - ${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \ - --strip-all ${1} "${btf_data}" 2>/dev/null - # Change e_type to ET_REL so that it can be used to link final vmlinux. - # GNU ld 2.35+ and lld do not allow an ET_EXEC input. - if is_enabled CONFIG_CPU_BIG_ENDIAN; then - et_rel='\0\1' - else - et_rel='\1\0' - fi - printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none - - btf_vmlinux_bin_o=${btf_data} -} - # Create ${2}.o file with all symbols from the ${1} object file kallsyms() { @@ -205,6 +177,7 @@ if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then fi btf_vmlinux_bin_o= +btfids_vmlinux= kallsymso= strip_debug= generate_map= @@ -232,11 +205,13 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then fi if is_enabled CONFIG_DEBUG_INFO_BTF; then - if ! gen_btf .tmp_vmlinux1; then + if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then echo >&2 "Failed to generate BTF for vmlinux" echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF" exit 1 fi + btf_vmlinux_bin_o=.tmp_vmlinux1.btf.o + btfids_vmlinux=.tmp_vmlinux1.BTF_ids fi if is_enabled CONFIG_KALLSYMS; then @@ -289,14 +264,9 @@ fi vmlinux_link "${VMLINUX}" -# fill in BTF IDs if is_enabled CONFIG_DEBUG_INFO_BTF; then - info BTFIDS "${VMLINUX}" - RESOLVE_BTFIDS_ARGS="" - if is_enabled CONFIG_WERROR; then - RESOLVE_BTFIDS_ARGS=" --fatal_warnings " - fi - ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_ARGS} "${VMLINUX}" + info OBJCOPY ${btfids_vmlinux} + ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index e721e20a2bbd..2cbc252259be 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -71,9 +71,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -124,6 +126,7 @@ struct object { struct btf *btf; struct btf *base_btf; + bool distill_base; struct { int fd; @@ -324,42 +327,16 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, BTF_ID_KIND_SYM); } -/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ -#ifndef SHF_COMPRESSED -#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ -#endif - -/* - * The data of compressed section should be aligned to 4 - * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld - * sets sh_addralign to 1, which makes libelf fail with - * misaligned section error during the update: - * FAILED elf_update(WRITE): invalid section alignment - * - * While waiting for ld fix, we fix the compressed sections - * sh_addralign value manualy. - */ -static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) +static void bswap_32_data(void *data, u32 nr_bytes) { - int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8; - - if (!(sh->sh_flags & SHF_COMPRESSED)) - return 0; - - if (sh->sh_addralign == expected) - return 0; + u32 cnt, i; + u32 *ptr; - pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n", - sh->sh_addralign, expected); + cnt = nr_bytes / sizeof(u32); + ptr = data; - sh->sh_addralign = expected; - - if (gelf_update_shdr(scn, sh) == 0) { - pr_err("FAILED cannot update section header: %s\n", - elf_errmsg(-1)); - return -1; - } - return 0; + for (i = 0; i < cnt; i++) + ptr[i] = bswap_32(ptr[i]); } static int elf_collect(struct object *obj) @@ -380,7 +357,7 @@ static int elf_collect(struct object *obj) elf_version(EV_CURRENT); - elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL); if (!elf) { close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", @@ -443,21 +420,20 @@ static int elf_collect(struct object *obj) obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } else if (!strcmp(name, BTF_IDS_SECTION)) { + /* + * If target endianness differs from host, we need to bswap32 + * the .BTF_ids section data on load, because .BTF_ids has + * Elf_Type = ELF_T_BYTE, and so libelf returns data buffer in + * the target endianness. We repeat this on dump. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from target to host endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } obj->efile.idlist = data; obj->efile.idlist_shndx = idx; obj->efile.idlist_addr = sh.sh_addr; - } else if (!strcmp(name, BTF_BASE_ELF_SEC)) { - /* If a .BTF.base section is found, do not resolve - * BTF ids relative to vmlinux; resolve relative - * to the .BTF.base section instead. btf__parse_split() - * will take care of this once the base BTF it is - * passed is NULL. - */ - obj->base_btf_path = NULL; } - - if (compressed_section_fix(elf, scn, &sh)) - return -1; } return 0; @@ -587,11 +563,26 @@ static int load_btf(struct object *obj) obj->base_btf = base_btf; obj->btf = btf; + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + return 0; out_err: btf__free(base_btf); btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; return err; } @@ -760,24 +751,6 @@ static int sets_patch(struct object *obj) */ BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); - - /* - * When ELF endianness does not match endianness of the - * host, libelf will do the translation when updating - * the ELF. This, however, corrupts SET8 flags which are - * already in the target endianness. So, let's bswap - * them to the host endianness and libelf will then - * correctly translate everything. - */ - if (obj->efile.encoding != ELFDATANATIVE) { - int i; - - set8->flags = bswap_32(set8->flags); - for (i = 0; i < set8->cnt; i++) { - set8->pairs[i].flags = - bswap_32(set8->pairs[i].flags); - } - } break; default: pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); @@ -793,8 +766,6 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - off_t err; - if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || __symbols_patch(obj, &obj->typedefs) || @@ -805,20 +776,90 @@ static int symbols_patch(struct object *obj) if (sets_patch(obj)) return -1; - /* Set type to ensure endian translation occurs. */ - obj->efile.idlist->d_type = ELF_T_WORD; + return 0; +} - elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY); +static int dump_raw_data(const char *out_path, const void *data, u32 size) +{ + size_t written; + FILE *file; - err = elf_update(obj->efile.elf, ELF_C_WRITE); - if (err < 0) { - pr_err("FAILED elf_update(WRITE): %s\n", - elf_errmsg(-1)); + file = fopen(out_path, "wb"); + if (!file) { + pr_err("Couldn't open %s for writing\n", out_path); + return -1; + } + + written = fwrite(data, 1, size, file); + if (written != size) { + pr_err("Failed to write data to %s\n", out_path); + fclose(file); + unlink(out_path); + return -1; + } + + fclose(file); + pr_debug("Dumped %lu bytes of data to %s\n", size, out_path); + + return 0; +} + +static int dump_raw_btf_ids(struct object *obj, const char *out_path) +{ + Elf_Data *data = obj->efile.idlist; + int err; + + if (!data || !data->d_buf) { + pr_debug("%s has no BTF_ids data to dump\n", obj->path); + return 0; + } + + /* + * If target endianness differs from host, we need to bswap32 the + * .BTF_ids section data before dumping so that the output is in + * target endianness. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from host to target endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } + + err = dump_raw_data(out_path, data->d_buf, data->d_size); + if (err) + return -1; + + return 0; +} + +static int dump_raw_btf(struct btf *btf, const char *out_path) +{ + const void *raw_btf_data; + u32 raw_btf_size; + int err; + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!raw_btf_data) { + pr_err("btf__raw_data() failed\n"); + return -1; } - pr_debug("update %s for %s\n", - err >= 0 ? "ok" : "failed", obj->path); - return err < 0 ? -1 : 0; + err = dump_raw_data(out_path, raw_btf_data, raw_btf_size); + if (err) + return -1; + + return 0; +} + +static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) +{ + int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); + + if (len < 0 || len >= buf_sz) { + pr_err("Output path is too long: %s%s\n", in_path, suffix); + return -E2BIG; + } + + return 0; } static const char * const resolve_btfids_usage[] = { @@ -840,6 +881,8 @@ int main(int argc, const char **argv) .sets = RB_ROOT, }; bool fatal_warnings = false; + char out_path[PATH_MAX]; + struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), @@ -849,6 +892,8 @@ int main(int argc, const char **argv) "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, "turn warnings into errors"), + OPT_BOOLEAN(0, "distill_base", &obj.distill_base, + "distill --btf_base and emit .BTF.base section data"), OPT_END() }; int err = -1; @@ -860,6 +905,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (load_btf(&obj)) + goto out; + if (elf_collect(&obj)) goto out; @@ -869,23 +917,37 @@ int main(int argc, const char **argv) */ if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { - pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n"); - err = 0; - goto out; + pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); + goto dump_btf; } if (symbols_collect(&obj)) goto out; - if (load_btf(&obj)) - goto out; - if (symbols_resolve(&obj)) goto out; if (symbols_patch(&obj)) goto out; + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_IDS_SECTION); + err = err ?: dump_raw_btf_ids(&obj, out_path); + if (err) + goto out; + +dump_btf: + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_ELF_SEC); + err = err ?: dump_raw_btf(obj.btf, out_path); + if (err) + goto out; + + if (obj.base_btf && obj.distill_base) { + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_BASE_ELF_SEC); + err = err ?: dump_raw_btf(obj.base_btf, out_path); + if (err) + goto out; + } + if (!(fatal_warnings && warnings)) err = 0; out: diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 19c1638e312a..b8bf51b7a0b0 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -45,3 +45,6 @@ xdp_synproxy xdp_hw_metadata xdp_features verification_cert.h +*.BTF +*.BTF_ids +*.BTF.base diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ffd0a4c354c7..f28a32b16ff0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -4,6 +4,7 @@ include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include CXX ?= $(CROSS_COMPILE)g++ +OBJCOPY ?= $(CROSS_COMPILE)objcopy CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) @@ -653,9 +654,10 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) - $$(if $$(TEST_NEEDS_BTFIDS), \ - $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ - $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ + $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -894,6 +896,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool $(TEST_KMOD_TARGETS) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ + *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ $(OUTPUT)/FEATURE-DUMP.selftests \ diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 51544372f52e..41dfaaabb73f 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -101,9 +101,9 @@ static int resolve_symbols(void) int type_id; __u32 nr; - btf = btf__parse_elf("btf_data.bpf.o", NULL); + btf = btf__parse_raw("resolve_btfids.test.o.BTF"); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.bpf.o\n")) + "Failed to load BTF from resolve_btfids.test.o.BTF\n")) return -1; nr = btf__type_cnt(btf); -- cgit v1.2.3 From d2749ae85aec685e52e0474f445f6a8552363eb0 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 16 Dec 2025 13:30:00 +0000 Subject: selftests/bpf: add test case for BPF LSM hook bpf_lsm_mmap_file Add a trivial test case asserting that the BPF verifier enforces PTR_MAYBE_NULL semantics on the struct file pointer argument of BPF LSM hook bpf_lsm_mmap_file(). Dereferencing the struct file pointer passed into bpf_lsm_mmap_file() without explicitly performing a NULL check first should not be permitted by the BPF verifier as it can lead to NULL pointer dereferences and a kernel crash. Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20251216133000.3690723-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_lsm.c | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 6af9100a37ff..38e8e9176862 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include +#include #include "bpf_misc.h" SEC("lsm/file_permission") @@ -159,4 +160,32 @@ __naked int disabled_hook_test3(void *ctx) ::: __clobber_all); } +SEC("lsm/mmap_file") +__description("not null checking nullable pointer in bpf_lsm_mmap_file") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(no_null_check, struct file *file) +{ + struct inode *inode; + + inode = file->f_inode; + __sink(inode); + + return 0; +} + +SEC("lsm/mmap_file") +__description("null checking nullable pointer in bpf_lsm_mmap_file") +__success +int BPF_PROG(null_check, struct file *file) +{ + struct inode *inode; + + if (file) { + inode = file->f_inode; + __sink(inode); + } + + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1045ec382c6019b63cab24428783749a1cecc439 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Dec 2025 15:26:12 +0100 Subject: kernel-doc: add support for handling global variables Specially on kAPI, sometimes it is desirable to be able to describe global variables that are part of kAPI. Documenting vars with Sphinx is simple, as we don't need to parse a data struct. All we need is the variable declaration and use native C domain ::c:var: to format it for us. Add support for it. Link: https://lore.kernel.org/linux-doc/491c3022-cef8-4860-a945-c9c4a3b63c09@infradead.org/T/#m947c25d95cb1d96a394410ab1131dc8e9e5013f1 Suggested-by: Randy Dunlap Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/kdoc/kdoc_output.py | 47 ++++++++++++++++++++++++++++++ tools/lib/python/kdoc/kdoc_parser.py | 56 +++++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py index b1aaa7fc3604..50aedbb3d6de 100644 --- a/tools/lib/python/kdoc/kdoc_output.py +++ b/tools/lib/python/kdoc/kdoc_output.py @@ -199,6 +199,10 @@ class OutputFormat: self.out_enum(fname, name, args) return self.data + if dtype == "var": + self.out_var(fname, name, args) + return self.data + if dtype == "typedef": self.out_typedef(fname, name, args) return self.data @@ -227,6 +231,9 @@ class OutputFormat: def out_enum(self, fname, name, args): """Outputs an enum""" + def out_var(self, fname, name, args): + """Outputs a variable""" + def out_typedef(self, fname, name, args): """Outputs a typedef""" @@ -472,6 +479,25 @@ class RestFormat(OutputFormat): self.lineprefix = oldprefix self.out_section(args) + def out_var(self, fname, name, args): + oldprefix = self.lineprefix + ln = args.declaration_start_line + full_proto = args.other_stuff["full_proto"] + + self.lineprefix = " " + + self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}{full_proto}\n\n" + + self.print_lineno(ln) + self.output_highlight(args.get('purpose', '')) + self.data += "\n" + + if args.other_stuff["default_val"]: + self.data += f'{self.lineprefix}**Initialization**\n\n' + self.output_highlight(f'default: ``{args.other_stuff["default_val"]}``') + + self.out_section(args) + def out_typedef(self, fname, name, args): oldprefix = self.lineprefix @@ -773,6 +799,27 @@ class ManFormat(OutputFormat): self.data += f'.SH "{section}"' + "\n" self.output_highlight(text) + def out_var(self, fname, name, args): + out_name = self.arg_name(args, name) + prototype = args.other_stuff["var_type"] + full_proto = args.other_stuff["full_proto"] + + self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"{prototype} \\- {args['purpose']}\n" + + self.data += ".SH SYNOPSIS\n" + self.data += f"{full_proto}\n" + + if args.other_stuff["default_val"]: + self.data += f'.SH "Initialization"' + "\n" + self.output_highlight(f'default: {args.other_stuff["default_val"]}') + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) + def out_typedef(self, fname, name, args): module = self.modulename purpose = args.get('purpose') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index 500aafc50032..06bed1a12a45 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -64,7 +64,7 @@ type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) # Tests for the beginning of a kerneldoc block in its various forms. # doc_block = doc_com + KernRe(r'DOC:\s*(.*)?', cache=False) -doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef)\b\s*(\w*)", cache = False) +doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef|var)\b\s*(\w*)", cache = False) doc_begin_func = KernRe(str(doc_com) + # initial " * ' r"(?:\w+\s*\*\s*)?" + # type (not captured) r'(?:define\s+)?' + # possible "define" (not captured) @@ -927,6 +927,58 @@ class KernelDoc: self.output_declaration('enum', declaration_name, purpose=self.entry.declaration_purpose) + def dump_var(self, ln, proto): + """ + Store variables that are part of kAPI. + """ + VAR_ATTRIBS = [ + "extern", + ] + OPTIONAL_VAR_ATTR = "^(?:" + "|".join(VAR_ATTRIBS) + ")?" + + sub_prefixes = [ + (KernRe(r"__read_mostly"), ""), + (KernRe(r"__ro_after_init"), ""), + (KernRe(r"(?://.*)$"), ""), + (KernRe(r"(?:/\*.*\*/)"), ""), + (KernRe(r";$"), ""), + (KernRe(r"=.*"), ""), + ] + + # + # Store the full prototype before modifying it + # + full_proto = proto + + # + # Drop comments and macros to have a pure C prototype + # + for search, sub in sub_prefixes: + proto = search.sub(sub, proto) + + proto = proto.rstrip() + + # + # Variable name is at the end of the declaration + # + + r= KernRe(OPTIONAL_VAR_ATTR + r"\w.*\s+(?:\*+)?([\w_]+)\s*[\d\]\[]*\s*(=.*)?") + if not r.match(proto): + self.emit_msg(ln,f"{proto}: can't parse variable") + return + + var_type = r.group(0) + declaration_name = r.group(1) + default_val = r.group(2) + if default_val: + default_val = default_val.lstrip("=").strip() + + self.output_declaration("var", declaration_name, + full_proto=full_proto, + var_type=var_type, + default_val=default_val, + purpose=self.entry.declaration_purpose) + def dump_declaration(self, ln, prototype): """ Stores a data declaration inside self.entries array. @@ -938,6 +990,8 @@ class KernelDoc: self.dump_typedef(ln, prototype) elif self.entry.decl_type in ["union", "struct"]: self.dump_struct(ln, prototype) + elif self.entry.decl_type == "var": + self.dump_var(ln, prototype) else: # This would be a bug self.emit_message(ln, f'Unknown declaration type: {self.entry.decl_type}') -- cgit v1.2.3 From bdd1cf87847ff6aaadd53a185209d2bb2db72165 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Dec 2025 15:26:13 +0100 Subject: kernel-doc: add support to handle DEFINE_ variables Improve the parser and output plugin to work with macros, adding support for the common pattern of using DEFINE_* to create variables. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <757a45100cfc493984574ff780aa9d90506eecb4.1765894964.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/kdoc_output.py | 5 ++--- tools/lib/python/kdoc/kdoc_parser.py | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py index 50aedbb3d6de..d2bf94275d65 100644 --- a/tools/lib/python/kdoc/kdoc_output.py +++ b/tools/lib/python/kdoc/kdoc_output.py @@ -486,7 +486,7 @@ class RestFormat(OutputFormat): self.lineprefix = " " - self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}{full_proto}\n\n" + self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}``{full_proto}``\n\n" self.print_lineno(ln) self.output_highlight(args.get('purpose', '')) @@ -801,13 +801,12 @@ class ManFormat(OutputFormat): def out_var(self, fname, name, args): out_name = self.arg_name(args, name) - prototype = args.other_stuff["var_type"] full_proto = args.other_stuff["full_proto"] self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" self.data += ".SH NAME\n" - self.data += f"{prototype} \\- {args['purpose']}\n" + self.data += f"{name} \\- {args['purpose']}\n" self.data += ".SH SYNOPSIS\n" self.data += f"{full_proto}\n" diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index 06bed1a12a45..aaa352855717 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -949,12 +949,27 @@ class KernelDoc: # Store the full prototype before modifying it # full_proto = proto + declaration_name = None + + # + # Handle macro definitions + # + macro_prefixes = [ + KernRe(r"DEFINE_[\w_]+\s*\(([\w_]+)\)"), + ] + + for r in macro_prefixes: + match = r.search(proto) + if match: + declaration_name = match.group(1) + break # # Drop comments and macros to have a pure C prototype # - for search, sub in sub_prefixes: - proto = search.sub(sub, proto) + if not declaration_name: + for r, sub in sub_prefixes: + proto = r.sub(sub, proto) proto = proto.rstrip() @@ -968,14 +983,16 @@ class KernelDoc: return var_type = r.group(0) - declaration_name = r.group(1) + + if not declaration_name: + declaration_name = r.group(1) + default_val = r.group(2) if default_val: default_val = default_val.lstrip("=").strip() self.output_declaration("var", declaration_name, full_proto=full_proto, - var_type=var_type, default_val=default_val, purpose=self.entry.declaration_purpose) -- cgit v1.2.3 From aaacd70fb77afe75075e8bdf8e493b0af42eeabd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Dec 2025 15:26:17 +0100 Subject: docs: kernel-doc.rst: Parse DEFINE_ macros without prefixes Currently, the logic for vars require a type DEFINE_foo(); where type is usually "static". Make the logic more generic. Reported-by: Randy Dunlap Closes: https://lore.kernel.org/linux-doc/e1dad7e4-a0ca-4be6-a33c-97b75175c12f@infradead.org/ Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/kdoc/kdoc_parser.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index aaa352855717..e137bd9a7dac 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -977,17 +977,23 @@ class KernelDoc: # Variable name is at the end of the declaration # + default_val = None + r= KernRe(OPTIONAL_VAR_ATTR + r"\w.*\s+(?:\*+)?([\w_]+)\s*[\d\]\[]*\s*(=.*)?") - if not r.match(proto): - self.emit_msg(ln,f"{proto}: can't parse variable") - return + if r.match(proto): + if not declaration_name: + declaration_name = r.group(1) - var_type = r.group(0) + default_val = r.group(2) + else: + r= KernRe(OPTIONAL_VAR_ATTR + r"(?:\w.*)?\s+(?:\*+)?(?:[\w_]+)\s*[\d\]\[]*\s*(=.*)?") + if r.match(proto): + default_val = r.group(1) if not declaration_name: - declaration_name = r.group(1) + self.emit_msg(ln,f"{proto}: can't parse variable") + return - default_val = r.group(2) if default_val: default_val = default_val.lstrip("=").strip() -- cgit v1.2.3 From 9dbbd32ecd7bbfa6d3fa150bf8de8bba25e8dab2 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 11 Dec 2025 10:48:49 +0000 Subject: kdoc: allow dots in inline @param names Inline kernel-doc blocks failed to parse tags containing dots (e.g. creator.process_name in panfrost_gem.h) because the @name regex only matched word characters. Modify the single-line pattern to match doc_inline_sect so it includes \. and parses the same as a multi-line comment. Signed-off-by: Steven Price Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jonathan Corbet Message-ID: <20251211104851.45330-1-steven.price@arm.com> --- tools/lib/python/kdoc/kdoc_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index e137bd9a7dac..a9a37519145d 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -53,7 +53,7 @@ doc_content = doc_com_body + KernRe(r'(.*)', cache=False) doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False) doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False) doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False) -doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False) +doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@\s*[\w][\w\.]*\s*):\s*(.*)\s*\*/\s*$', cache=False) export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False) export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False) -- cgit v1.2.3 From 6bce6ddbe634bbc6d21672b5bfdbb5ad0409bd8d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 22 Dec 2025 20:41:55 -0800 Subject: bpf: selftests: selftests for memcg stat kfuncs Add test coverage for the kfuncs that fetch memcg stats. Using some common stats, test scenarios ensuring that the given stat increases by some arbitrary amount. The stats selected cover the three categories represented by the enums: node_stat_item, memcg_stat_item, vm_event_item. Since only a subset of all stats are queried, use a static struct made up of fields for each stat. Write to the struct with the fetched values when the bpf program is invoked and read the fields in the user mode program for verification. Signed-off-by: JP Kobryn Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-6-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/cgroup_iter_memcg.h | 18 ++ .../selftests/bpf/prog_tests/cgroup_iter_memcg.c | 223 +++++++++++++++++++++ .../selftests/bpf/progs/cgroup_iter_memcg.c | 39 ++++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h new file mode 100644 index 000000000000..3f59b127943b --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __CGROUP_ITER_MEMCG_H +#define __CGROUP_ITER_MEMCG_H + +struct memcg_query { + /* some node_stat_item's */ + unsigned long nr_anon_mapped; + unsigned long nr_shmem; + unsigned long nr_file_pages; + unsigned long nr_file_mapped; + /* some memcg_stat_item */ + unsigned long memcg_kmem; + /* some vm_event_item */ + unsigned long pgfault; +}; + +#endif /* __CGROUP_ITER_MEMCG_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c new file mode 100644 index 000000000000..a5afd16705f0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" +#include "cgroup_iter_memcg.h" +#include "cgroup_iter_memcg.skel.h" + +static int read_stats(struct bpf_link *link) +{ + int fd, ret = 0; + ssize_t bytes; + + fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_OK_FD(fd, "bpf_iter_create")) + return 1; + + /* + * Invoke iter program by reading from its fd. We're not expecting any + * data to be written by the bpf program so the result should be zero. + * Results will be read directly through the custom data section + * accessible through skel->data_query.memcg_query. + */ + bytes = read(fd, NULL, 0); + if (!ASSERT_EQ(bytes, 0, "read fd")) + ret = 1; + + close(fd); + return ret; +} + +static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg anon usage by mapping and writing + * to a new anon region. + */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + char *path; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + path = "/tmp/test_cgroup_iter_memcg"; + + /* + * Increase memcg file usage by creating and writing + * to a mapped file. + */ + fd = open(path, O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); + unlink(path); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 64 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = i - 1; i >= 0; i--) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. */ + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val"); + +cleanup: + munmap(map, len); +} + +void test_cgroup_iter_memcg(void) +{ + char *cgroup_rel_path = "/cgroup_iter_memcg_test"; + struct cgroup_iter_memcg *skel; + struct bpf_link *link; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(cgroup_rel_path); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join")) + return; + + skel = cgroup_iter_memcg__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load")) + goto cleanup_cgroup_fd; + + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = { + .cgroup.cgroup_fd = cgroup_fd, + .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY, + }; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto cleanup_skel; + + if (test__start_subtest("cgroup_iter_memcg__anon")) + test_anon(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__shmem")) + test_shmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__file")) + test_file(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__kmem")) + test_kmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__pgfault")) + test_pgfault(link, &skel->data_query->memcg_query); + + bpf_link__destroy(link); +cleanup_skel: + cgroup_iter_memcg__destroy(skel); +cleanup_cgroup_fd: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c new file mode 100644 index 000000000000..59fb70a3cc50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "cgroup_iter_memcg.h" + +char _license[] SEC("license") = "GPL"; + +/* The latest values read are stored here. */ +struct memcg_query memcg_query SEC(".data.query"); + +SEC("iter.s/cgroup") +int cgroup_memcg_query(struct bpf_iter__cgroup *ctx) +{ + struct cgroup *cgrp = ctx->cgroup; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + if (!cgrp) + return 1; + + css = &cgrp->self; + memcg = bpf_get_mem_cgroup(css); + if (!memcg) + return 1; + + bpf_mem_cgroup_flush_stats(memcg); + + memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED); + memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM); + memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED); + memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM); + memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT); + + bpf_put_mem_cgroup(memcg); + + return 0; +} -- cgit v1.2.3 From 83dd46ecb68ecc03cff23e68490ded5d40d79f66 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 05:32:46 -0800 Subject: selftests: bpf: fix tests with raw_tp calling kfuncs As the previous commit allowed raw_tp programs to call kfuncs, so of the selftests that were expected to fail will now succeed. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251222133250.1890587-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dynptr_fail.c | 2 +- tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dda6a8dada82..8f2ae9640886 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -1465,7 +1465,7 @@ int xdp_invalid_data_slice2(struct xdp_md *xdp) } /* Only supported prog type can create skb-type dynptrs */ -SEC("?raw_tp") +SEC("?xdp") __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed") int skb_invalid_ctx(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index a509cad97e69..1fce7a7e8d03 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -32,7 +32,7 @@ static void task_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(task_kfunc_raw_tp) { task_kfunc_load_test(); @@ -86,7 +86,7 @@ static void cgrp_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cgrp_kfunc_raw_tp) { cgrp_kfunc_load_test(); @@ -138,7 +138,7 @@ static void cpumask_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cpumask_kfunc_raw_tp) { cpumask_kfunc_load_test(); -- cgit v1.2.3 From efecc9e825f4aa3fe616236152604a066a3e776d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:19 -0800 Subject: selftests: bpf: test non-sleepable arena allocations As arena kfuncs can now be called from non-sleepable contexts, test this by adding non-sleepable copies of tests in verifier_arena, this is done by using a socket program instead of syscall. Add a new test case in verifier_arena_large to check that the bpf_arena_alloc_pages() works for more than 1024 pages. 1024 * sizeof(struct page *) is the upper limit of kmalloc_nolock() but bpf_arena_alloc_pages() should still succeed because it re-uses this array in a loop. Augment the arena_list selftest to also run in non-sleepable context by taking rcu_read_lock. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/arena_list.c | 20 ++- tools/testing/selftests/bpf/progs/arena_list.c | 11 ++ tools/testing/selftests/bpf/progs/verifier_arena.c | 185 +++++++++++++++++++++ .../selftests/bpf/progs/verifier_arena_large.c | 29 ++++ 4 files changed, 240 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c index d15867cddde0..4f2866a615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_list.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -27,17 +27,23 @@ static int list_sum(struct arena_list_head *head) return sum; } -static void test_arena_list_add_del(int cnt) +static void test_arena_list_add_del(int cnt, bool nonsleepable) { LIBBPF_OPTS(bpf_test_run_opts, opts); struct arena_list *skel; int expected_sum = (u64)cnt * (cnt - 1) / 2; int ret, sum; - skel = arena_list__open_and_load(); - if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + skel = arena_list__open(); + if (!ASSERT_OK_PTR(skel, "arena_list__open")) return; + skel->rodata->nonsleepable = nonsleepable; + + ret = arena_list__load(skel); + if (!ASSERT_OK(ret, "arena_list__load")) + goto out; + skel->bss->cnt = cnt; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); ASSERT_OK(ret, "ret_add"); @@ -65,7 +71,11 @@ out: void test_arena_list(void) { if (test__start_subtest("arena_list_1")) - test_arena_list_add_del(1); + test_arena_list_add_del(1, false); if (test__start_subtest("arena_list_1000")) - test_arena_list_add_del(1000); + test_arena_list_add_del(1000, false); + if (test__start_subtest("arena_list_1_nonsleepable")) + test_arena_list_add_del(1, true); + if (test__start_subtest("arena_list_1000_nonsleepable")) + test_arena_list_add_del(1000, true); } diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index 3a2ddcacbea6..235d8cc95bdd 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -30,6 +30,7 @@ struct arena_list_head __arena *list_head; int list_sum; int cnt; bool skip = false; +const volatile bool nonsleepable = false; #ifdef __BPF_FEATURE_ADDR_SPACE_CAST long __arena arena_sum; @@ -42,6 +43,9 @@ int test_val SEC(".addr_space.1"); int zero; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + SEC("syscall") int arena_list_add(void *ctx) { @@ -71,6 +75,10 @@ int arena_list_del(void *ctx) struct elem __arena *n; int sum = 0; + /* Take rcu_read_lock to test non-sleepable context */ + if (nonsleepable) + bpf_rcu_read_lock(); + arena_sum = 0; list_for_each_entry(n, list_head, node) { sum += n->value; @@ -79,6 +87,9 @@ int arena_list_del(void *ctx) bpf_free(n); } list_sum = sum; + + if (nonsleepable) + bpf_rcu_read_unlock(); #else skip = true; #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 7f4827eede3c..4a9d96344813 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -21,6 +21,37 @@ struct { #endif } arena SEC(".maps"); +SEC("socket") +__success __retval(0) +int basic_alloc1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile int __arena *page1, *page2, *no_page; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0 && *page2 != 2) /* use-after-free should return 0 or the stored value */ + return 7; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc1(void *ctx) @@ -60,6 +91,44 @@ int basic_alloc1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_alloc2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0 && *page1 != 1) + return 5; + if (*page2 != 0 && *page2 != 2) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc2(void *ctx) @@ -102,6 +171,19 @@ struct bpf_arena___l { struct bpf_map map; } __attribute__((preserve_access_index)); +SEC("socket") +__success __retval(0) __log_level(2) +int basic_alloc3_nosleep(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + SEC("syscall") __success __retval(0) __log_level(2) int basic_alloc3(void *ctx) @@ -115,6 +197,38 @@ int basic_alloc3(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return 1; + + page += __PAGE_SIZE; + + /* Reserve the second page */ + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 2; + + /* Try to explicitly allocate the reserved page. */ + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if (page) + return 3; + + /* Try to implicitly allocate the page (since there's only 2 of them). */ + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (page) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve1(void *ctx) @@ -147,6 +261,26 @@ int basic_reserve1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if ((u64)page) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve2(void *ctx) @@ -168,6 +302,27 @@ int basic_reserve2(void *ctx) } /* Reserve the same page twice, should return -EBUSY. */ +SEC("socket") +__success __retval(0) +int reserve_twice_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret != -EBUSY) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_twice(void *ctx) @@ -190,6 +345,36 @@ int reserve_twice(void *ctx) } /* Try to reserve past the end of the arena. */ +SEC("socket") +__success __retval(0) +int reserve_invalid_region_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + /* Try a NULL pointer. */ + ret = bpf_arena_reserve_pages(&arena, NULL, 3); + if (ret != -EINVAL) + return 1; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 3); + if (ret != -EINVAL) + return 2; + + ret = bpf_arena_reserve_pages(&arena, page, 4096); + if (ret != -EINVAL) + return 3; + + ret = bpf_arena_reserve_pages(&arena, page, (1ULL << 32) - 1); + if (ret != -EINVAL) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_invalid_region(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 2b8cf2a4d880..4ca491cbe8d1 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -283,5 +283,34 @@ int big_alloc2(void *ctx) return 9; return 0; } + +SEC("socket") +__success __retval(0) +int big_alloc3(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *pages; + u64 i; + + /* + * Allocate 2051 pages in one go to check how kmalloc_nolock() handles large requests. + * Since kmalloc_nolock() can allocate up to 1024 struct page * at a time, this call should + * result in three batches: two batches of 1024 pages each, followed by a final batch of 3 + * pages. + */ + pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); + if (!pages) + return -1; + + bpf_for(i, 0, 2051) + pages[i * PAGE_SIZE] = 123; + bpf_for(i, 0, 2051) + if (pages[i * PAGE_SIZE] != 123) + return i; + + bpf_arena_free_pages(&arena, pages, 2051); +#endif + return 0; +} #endif char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bd09d9a05cf04028f639e209b416bacaeffd4909 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 27 Oct 2025 20:07:24 +0100 Subject: selftests/landlock: Fix TCP bind(AF_UNSPEC) test case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nominal error code for bind(AF_UNSPEC) on an IPv6 socket is -EAFNOSUPPORT, not -EINVAL. -EINVAL is only returned when the supplied address struct is too short, which happens to be the case in current selftests because they treat AF_UNSPEC like IPv4 sockets do: as an alias for AF_INET (which is a 16-byte struct instead of the 24 bytes required by IPv6 sockets). Make the union large enough for any address (by adding struct sockaddr_storage to the union), and make AF_UNSPEC addresses large enough for any family. Test for -EAFNOSUPPORT instead, and add a dedicated test case for truncated inputs with -EINVAL. Fixes: a549d055a22e ("selftests/landlock: Add network tests") Signed-off-by: Matthieu Buffet Link: https://lore.kernel.org/r/20251027190726.626244-2-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 1 + tools/testing/selftests/landlock/net_test.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 230b75f6015b..90551650299c 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -237,6 +237,7 @@ struct service_fixture { struct sockaddr_un unix_addr; socklen_t unix_addr_len; }; + struct sockaddr_storage _largest; }; }; diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 2a45208551e6..3bbc0508420b 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -121,6 +121,10 @@ static socklen_t get_addrlen(const struct service_fixture *const srv, { switch (srv->protocol.domain) { case AF_UNSPEC: + if (minimal) + return sizeof(sa_family_t); + return sizeof(struct sockaddr_storage); + case AF_INET: return sizeof(srv->ipv4_addr); @@ -758,6 +762,11 @@ TEST_F(protocol, bind_unspec) bind_fd = socket_variant(&self->srv0); ASSERT_LE(0, bind_fd); + /* Tries to bind with too small addrlen. */ + EXPECT_EQ(-EINVAL, bind_variant_addrlen( + bind_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, true) - 1)); + /* Allowed bind on AF_UNSPEC/INADDR_ANY. */ ret = bind_variant(bind_fd, &self->unspec_any0); if (variant->prot.domain == AF_INET) { @@ -766,6 +775,8 @@ TEST_F(protocol, bind_unspec) TH_LOG("Failed to bind to unspec/any socket: %s", strerror(errno)); } + } else if (variant->prot.domain == AF_INET6) { + EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret); } @@ -792,6 +803,8 @@ TEST_F(protocol, bind_unspec) } else { EXPECT_EQ(0, ret); } + } else if (variant->prot.domain == AF_INET6) { + EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret); } @@ -801,7 +814,8 @@ TEST_F(protocol, bind_unspec) bind_fd = socket_variant(&self->srv0); ASSERT_LE(0, bind_fd); ret = bind_variant(bind_fd, &self->unspec_srv0); - if (variant->prot.domain == AF_INET) { + if (variant->prot.domain == AF_INET || + variant->prot.domain == AF_INET6) { EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret) -- cgit v1.2.3 From 6685201ebfacff0c889bcd569181fa6e8af5575e Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 27 Oct 2025 20:07:25 +0100 Subject: selftests/landlock: Add missing connect(minimal AF_UNSPEC) test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit connect_variant(unspec_any0) is called twice. Both calls end up in connect_variant_addrlen() with an address length of get_addrlen(minimal=false). However, the connect() syscall and its variants (e.g. iouring/compat) accept much shorter addresses of 4 bytes and that behaviour was not tested. Replace one of these calls with one using a minimal address length (just a bare sa_family=AF_UNSPEC field with no actual address). Also add a call using a truncated address for good measure. Signed-off-by: Matthieu Buffet Link: https://lore.kernel.org/r/20251027190726.626244-3-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/net_test.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 3bbc0508420b..b34b139b3f89 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -906,7 +906,19 @@ TEST_F(protocol, connect_unspec) EXPECT_EQ(0, close(ruleset_fd)); } - ret = connect_variant(connect_fd, &self->unspec_any0); + /* Try to re-disconnect with a truncated address struct. */ + EXPECT_EQ(-EINVAL, + connect_variant_addrlen( + connect_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, true) - 1)); + + /* + * Re-disconnect, with a minimal sockaddr struct (just a + * bare af_family=AF_UNSPEC field). + */ + ret = connect_variant_addrlen(connect_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, + true)); if (self->srv0.protocol.domain == AF_UNIX && self->srv0.protocol.type == SOCK_STREAM) { EXPECT_EQ(-EINVAL, ret); -- cgit v1.2.3 From e1a57c33590a50a6639798e60a597af4a23b0340 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 1 Dec 2025 01:36:31 +0100 Subject: selftests/landlock: Remove invalid unix socket bind() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove bind() call on a client socket that doesn't make sense. Since strlen(cli_un.sun_path) returns a random value depending on stack garbage, that many uninitialized bytes are read from the stack as an unix socket address. This creates random test failures due to the bind address being invalid or already in use if the same stack value comes up twice. Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets") Signed-off-by: Matthieu Buffet Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20251201003631.190817-1-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index eee814e09dd7..7d378bdf3bce 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4391,9 +4391,6 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, cli_fd); - size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); - ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); - bzero(&cli_un, sizeof(cli_un)); cli_un.sun_family = AF_UNIX; strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); -- cgit v1.2.3 From e4aa4461d4acb922ef45785581232f0588a6eea8 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Tue, 2 Dec 2025 22:51:41 +0100 Subject: selftests/landlock: NULL-terminate unix pathname addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size of Unix pathname addresses is computed in selftests using offsetof(struct sockaddr_un, sun_path) + strlen(xxx). It should have been that +1, which makes addresses passed to the libc and kernel non-NULL-terminated. unix_mkname_bsd() fixes that in Linux so there is no harm, but just using sizeof(the address struct) should improve readability. Signed-off-by: Matthieu Buffet Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20251202215141.689986-1-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 24 ++++++++++------------ .../selftests/landlock/scoped_abstract_unix_test.c | 21 ++++++++----------- 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 7d378bdf3bce..76491ba54dce 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4362,22 +4362,24 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) { const char *const path = file1_s1d1; int srv_fd, cli_fd, ruleset_fd; - socklen_t size; - struct sockaddr_un srv_un, cli_un; + struct sockaddr_un srv_un = { + .sun_family = AF_UNIX, + }; + struct sockaddr_un cli_un = { + .sun_family = AF_UNIX, + }; const struct landlock_ruleset_attr attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, }; /* Sets up a server */ - srv_un.sun_family = AF_UNIX; - strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); - ASSERT_EQ(0, unlink(path)); srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, srv_fd); - size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); - ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, sizeof(srv_un))); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); /* Enables Landlock. */ @@ -4387,16 +4389,12 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) ASSERT_EQ(0, close(ruleset_fd)); /* Sets up a client connection to it */ - cli_un.sun_family = AF_UNIX; cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, cli_fd); - bzero(&cli_un, sizeof(cli_un)); - cli_un.sun_family = AF_UNIX; strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); - size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); - - ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + ASSERT_EQ(0, + connect(cli_fd, (struct sockaddr *)&cli_un, sizeof(cli_un))); /* FIONREAD and other IOCTLs should not be forbidden. */ EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c index 6825082c079c..2cdf1ba07016 100644 --- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c +++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c @@ -779,7 +779,6 @@ FIXTURE_TEARDOWN(various_address_sockets) TEST_F(various_address_sockets, scoped_pathname_sockets) { - socklen_t size_stream, size_dgram; pid_t child; int status; char buf_child, buf_parent; @@ -798,12 +797,8 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) /* Pathname address. */ snprintf(stream_pathname_addr.sun_path, sizeof(stream_pathname_addr.sun_path), "%s", stream_path); - size_stream = offsetof(struct sockaddr_un, sun_path) + - strlen(stream_pathname_addr.sun_path); snprintf(dgram_pathname_addr.sun_path, sizeof(dgram_pathname_addr.sun_path), "%s", dgram_path); - size_dgram = offsetof(struct sockaddr_un, sun_path) + - strlen(dgram_pathname_addr.sun_path); /* Abstract address. */ memset(&stream_abstract_addr, 0, sizeof(stream_abstract_addr)); @@ -841,8 +836,9 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) /* Connects with pathname sockets. */ stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, stream_pathname_socket); - ASSERT_EQ(0, connect(stream_pathname_socket, - &stream_pathname_addr, size_stream)); + ASSERT_EQ(0, + connect(stream_pathname_socket, &stream_pathname_addr, + sizeof(stream_pathname_addr))); ASSERT_EQ(1, write(stream_pathname_socket, "b", 1)); EXPECT_EQ(0, close(stream_pathname_socket)); @@ -850,12 +846,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0); ASSERT_LE(0, dgram_pathname_socket); err = sendto(dgram_pathname_socket, "c", 1, 0, - &dgram_pathname_addr, size_dgram); + &dgram_pathname_addr, sizeof(dgram_pathname_addr)); EXPECT_EQ(1, err); /* Sends with connection. */ - ASSERT_EQ(0, connect(dgram_pathname_socket, - &dgram_pathname_addr, size_dgram)); + ASSERT_EQ(0, + connect(dgram_pathname_socket, &dgram_pathname_addr, + sizeof(dgram_pathname_addr))); ASSERT_EQ(1, write(dgram_pathname_socket, "d", 1)); EXPECT_EQ(0, close(dgram_pathname_socket)); @@ -910,13 +907,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, stream_pathname_socket); ASSERT_EQ(0, bind(stream_pathname_socket, &stream_pathname_addr, - size_stream)); + sizeof(stream_pathname_addr))); ASSERT_EQ(0, listen(stream_pathname_socket, backlog)); dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0); ASSERT_LE(0, dgram_pathname_socket); ASSERT_EQ(0, bind(dgram_pathname_socket, &dgram_pathname_addr, - size_dgram)); + sizeof(dgram_pathname_addr))); /* Sets up abstract servers. */ stream_abstract_socket = socket(AF_UNIX, SOCK_STREAM, 0); -- cgit v1.2.3 From 14c00e30d3a29a7fb6053fcaa54aeb6c07fb1055 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:31 +0000 Subject: selftests/landlock: Fix typo in fs_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/62d18e06eeb26f62bc49d24c4467b3793c5ba32b.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 76491ba54dce..37a5a3df712e 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -7069,8 +7069,8 @@ static int matches_log_fs_extra(struct __test_metadata *const _metadata, return -E2BIG; /* - * It is assume that absolute_path does not contain control characters nor - * spaces, see audit_string_contains_control(). + * It is assumed that absolute_path does not contain control + * characters nor spaces, see audit_string_contains_control(). */ absolute_path = realpath(path, NULL); if (!absolute_path) -- cgit v1.2.3 From 7aa593d8fb64b884bf00c13e01387b0733f3d786 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:32 +0000 Subject: selftests/landlock: Fix missing semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing semicolon after EXPECT_EQ(0, close(stream_server_child)) in the scoped_vs_unscoped test. I suspect currently it's just not executing the close statement after the line, but this causes no observable difference. Fixes: fefcf0f7cf47 ("selftests/landlock: Test abstract UNIX socket scoping") Cc: Tahera Fahimi Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/d9e968e4cd4ecc9bf487593d7b7220bffbb3b5f5.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/scoped_abstract_unix_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c index 2cdf1ba07016..72f97648d4a7 100644 --- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c +++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c @@ -543,7 +543,7 @@ TEST_F(scoped_vs_unscoped, unix_scoping) ASSERT_EQ(1, write(pipe_child[1], ".", 1)); ASSERT_EQ(grand_child, waitpid(grand_child, &status, 0)); - EXPECT_EQ(0, close(stream_server_child)) + EXPECT_EQ(0, close(stream_server_child)); EXPECT_EQ(0, close(dgram_server_child)); return; } -- cgit v1.2.3 From 55dc93a7c2717311d48ca0a47c5f8c1b0755a115 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:34 +0000 Subject: selftests/landlock: Use scoped_base_variants.h for ptrace_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ptrace_test.c currently contains a duplicated version of the scoped_domains fixture variants. This patch removes that and make it use the shared scoped_base_variants.h instead, like in scoped_abstract_unix_test and scoped_signal_test. This required renaming the hierarchy fixture to scoped_domains, but the test is otherwise the same. Cc: Tahera Fahimi Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/48148f0134f95f819a25277486a875a6fd88ecf9.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/ptrace_test.c | 154 +-------------------- .../selftests/landlock/scoped_base_variants.h | 9 +- 2 files changed, 12 insertions(+), 151 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c index 4e356334ecb7..4f64c90583cd 100644 --- a/tools/testing/selftests/landlock/ptrace_test.c +++ b/tools/testing/selftests/landlock/ptrace_test.c @@ -86,16 +86,9 @@ static int get_yama_ptrace_scope(void) } /* clang-format off */ -FIXTURE(hierarchy) {}; +FIXTURE(scoped_domains) {}; /* clang-format on */ -FIXTURE_VARIANT(hierarchy) -{ - const bool domain_both; - const bool domain_parent; - const bool domain_child; -}; - /* * Test multiple tracing combinations between a parent process P1 and a child * process P2. @@ -104,155 +97,18 @@ FIXTURE_VARIANT(hierarchy) * restriction is enforced in addition to any Landlock check, which means that * all P2 requests to trace P1 would be denied. */ +#include "scoped_base_variants.h" -/* - * No domain - * - * P1-. P1 -> P2 : allow - * \ P2 -> P1 : allow - * 'P2 - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_without_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = false, - .domain_child = false, -}; - -/* - * Child domain - * - * P1--. P1 -> P2 : allow - * \ P2 -> P1 : deny - * .'-----. - * | P2 | - * '------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_with_one_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = false, - .domain_child = true, -}; - -/* - * Parent domain - * .------. - * | P1 --. P1 -> P2 : deny - * '------' \ P2 -> P1 : allow - * ' - * P2 - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_parent_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = true, - .domain_child = false, -}; - -/* - * Parent + child domain (siblings) - * .------. - * | P1 ---. P1 -> P2 : deny - * '------' \ P2 -> P1 : deny - * .---'--. - * | P2 | - * '------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_sibling_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = true, - .domain_child = true, -}; - -/* - * Same domain (inherited) - * .-------------. - * | P1----. | P1 -> P2 : allow - * | \ | P2 -> P1 : allow - * | ' | - * | P2 | - * '-------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_sibling_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = false, - .domain_child = false, -}; - -/* - * Inherited + child domain - * .-----------------. - * | P1----. | P1 -> P2 : allow - * | \ | P2 -> P1 : deny - * | .-'----. | - * | | P2 | | - * | '------' | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_with_nested_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = false, - .domain_child = true, -}; - -/* - * Inherited + parent domain - * .-----------------. - * |.------. | P1 -> P2 : deny - * || P1 ----. | P2 -> P1 : allow - * |'------' \ | - * | ' | - * | P2 | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_nested_and_parent_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = true, - .domain_child = false, -}; - -/* - * Inherited + parent and child domain (siblings) - * .-----------------. - * | .------. | P1 -> P2 : deny - * | | P1 . | P2 -> P1 : deny - * | '------'\ | - * | \ | - * | .--'---. | - * | | P2 | | - * | '------' | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_forked_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = true, - .domain_child = true, -}; - -FIXTURE_SETUP(hierarchy) +FIXTURE_SETUP(scoped_domains) { } -FIXTURE_TEARDOWN(hierarchy) +FIXTURE_TEARDOWN(scoped_domains) { } /* Test PTRACE_TRACEME and PTRACE_ATTACH for parent and child. */ -TEST_F(hierarchy, trace) +TEST_F(scoped_domains, trace) { pid_t child, parent; int status, err_proc_read; diff --git a/tools/testing/selftests/landlock/scoped_base_variants.h b/tools/testing/selftests/landlock/scoped_base_variants.h index d3b1fa8a584e..7116728ebc68 100644 --- a/tools/testing/selftests/landlock/scoped_base_variants.h +++ b/tools/testing/selftests/landlock/scoped_base_variants.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Landlock scoped_domains variants + * Landlock scoped_domains test variant definition. * - * See the hierarchy variants from ptrace_test.c + * This file defines a fixture variant "scoped_domains" that has all + * permutations of parent/child process being in separate or shared + * Landlock domain, or not being in a Landlock domain at all. + * + * Scoped access tests can include this file to avoid repeating these + * combinations. * * Copyright © 2017-2020 Mickaël Salaün * Copyright © 2019-2020 ANSSI -- cgit v1.2.3 From 317a5df78f24bd77fb770a26eb85bf39620592e0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 30 Dec 2025 11:51:32 -0800 Subject: selftests/bpf: Fix verifier_arena_large/big_alloc3 test The big_alloc3() test tries to allocate 2051 pages at once in non-sleepable context and this can fail sporadically on resource contrained systems, so skip this test in case of such failures. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251230195134.599463-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 4ca491cbe8d1..5f7e7afee169 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -300,7 +300,7 @@ int big_alloc3(void *ctx) */ pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); if (!pages) - return -1; + return 0; bpf_for(i, 0, 2051) pages[i * PAGE_SIZE] = 123; -- cgit v1.2.3 From e6f2612f0e7c23ce991d3094b5387caf1a52a4fe Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 29 Dec 2025 23:13:08 -0800 Subject: selftests/bpf: test cases for bpf_loop SCC and state graph backedges Test for state graph backedges accumulation for SCCs formed by bpf_loop(). Equivalent to the following C program: int main(void) { 1: fp[-8] = bpf_get_prandom_u32(); 2: fp[-16] = -32; // used in a memory access below 3: bpf_loop(7, loop_cb4, fp, 0); 4: return 0; } int loop_cb4(int i, void *ctx) { 5: if (unlikely(ctx[-8] > bpf_get_prandom_u32())) 6: *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected 7: if (unlikely(fp[-8] > bpf_get_prandom_u32())) 8: ctx[-16] = -31; // makes said access unaligned 9: return 0; } If state graph backedges are not accumulated properly at the SCC formed by loop_cb4() call from bpf_loop(), the state {ctx[-16]=-32} injected at instruction 9 on verification path 1,2,3,5,7,9,4 would be considered fully verified and would lack precision mark for ctx[-16]. This would lead to early pruning of verification path 1,2,3,5,7,8,9 in state {ctx[-16]=-31}, which in turn leads to the incorrect assumption that the above program is safe. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-2-ceadfe679900@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 75 +++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7dd92a303bf6..69061f030957 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1926,4 +1926,79 @@ static int loop1_wrapper(void) ); } +/* + * This is similar to a test case absent_mark_in_the_middle_state(), + * but adapted for use with bpf_loop(). + */ +SEC("raw_tp") +__flag(BPF_F_TEST_STATE_FREQ) +__failure __msg("math between fp pointer and register with unbounded min value is not allowed") +__naked void absent_mark_in_the_middle_state4(void) +{ + /* + * Equivalent to a C program below: + * + * int main(void) { + * fp[-8] = bpf_get_prandom_u32(); + * fp[-16] = -32; // used in a memory access below + * bpf_loop(7, loop_cb4, fp, 0); + * return 0; + * } + * + * int loop_cb4(int i, void *ctx) { + * if (unlikely(ctx[-8] > bpf_get_prandom_u32())) + * *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected + * if (unlikely(fp[-8] > bpf_get_prandom_u32())) + * ctx[-16] = -31; // makes said access unaligned + * return 0; + * } + */ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r8 = r0;" + "*(u64 *)(r10 - 8) = r0;" + "*(u64 *)(r10 - 16) = -32;" + "r1 = 7;" + "r2 = loop_cb4 ll;" + "r3 = r10;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_loop), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +__used __naked +static void loop_cb4(void) +{ + asm volatile ( + "r9 = r2;" + "r8 = *(u64 *)(r9 - 8);" + "r6 = *(u64 *)(r9 - 16);" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto use_fp16_%=;" + "1:" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto update_fp16_%=;" + "2:" + "r0 = 0;" + "exit;" + "use_fp16_%=:" + "r1 = r10;" + "r1 += r6;" + "*(u64 *)(r1 + 0) = 42;" + "goto 1b;" + "update_fp16_%=:" + "*(u64 *)(r9 - 16) = -31;" + "goto 2b;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4fd99103eef347174b3c9b6071428324a3cf9a60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 30 Dec 2025 21:36:04 -0800 Subject: selftests/bpf: iterator based loop and STACK_MISC states pruning The test case first initializes 9 stack slots as STACK_MISC, then conditionally updates each of them to SCALAR spill inside an iterator based loop. This leads to 2**9 combinations of MISC/SPILL marks for these slots at the iterator next call. The loop converges only if the verifier treats such states as equivalent, otherwise visited states are evicted from the states cache too quickly. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-2-585cfd6cec51@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 69061f030957..7f27b517d5d5 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1997,6 +1997,71 @@ static void loop_cb4(void) "goto 2b;" : : __imm(bpf_get_prandom_u32) + ); +} + +SEC("raw_tp") +__success +__naked int stack_misc_vs_scalar_in_a_loop(void) +{ + asm volatile( + "*(u8 *)(r10 - 15) = 1;" /* This marks stack slot fp[-16] as STACK_MISC. */ + "*(u8 *)(r10 - 23) = 1;" + "*(u8 *)(r10 - 31) = 1;" + "*(u8 *)(r10 - 39) = 1;" + "*(u8 *)(r10 - 47) = 1;" + "*(u8 *)(r10 - 55) = 1;" + "*(u8 *)(r10 - 63) = 1;" + "*(u8 *)(r10 - 71) = 1;" + "*(u8 *)(r10 - 79) = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "loop_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto loop_end_%=;" + +#define maybe_change_stack_slot(off) \ + "call %[bpf_get_prandom_u32];" \ + "if r0 == 42 goto +1;" \ + "goto +1;" \ + "*(u64 *)(r10 " #off ") = r0;" + + /* + * When comparing verifier states fp[-16] will be + * either STACK_MISC or SCALAR. Pruning logic should + * consider old STACK_MISC equivalent to current SCALAR + * to avoid states explosion. + */ + maybe_change_stack_slot(-16) + maybe_change_stack_slot(-24) + maybe_change_stack_slot(-32) + maybe_change_stack_slot(-40) + maybe_change_stack_slot(-48) + maybe_change_stack_slot(-56) + maybe_change_stack_slot(-64) + maybe_change_stack_slot(-72) + maybe_change_stack_slot(-80) + +#undef maybe_change_stack_slot + + "goto loop_%=;" + "loop_end_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_destroy];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_iter_num_new), + __imm(bpf_iter_num_next), + __imm(bpf_iter_num_destroy), + __imm_addr(amap) : __clobber_all ); } -- cgit v1.2.3 From 1a8fa7faf4890d201aad4f5d4943f74d840cd0ba Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 30 Dec 2025 17:25:57 -0800 Subject: resolve_btfids: Implement --patch_btfids Recent changes in BTF generation [1] rely on ${OBJCOPY} command to update .BTF_ids section data in target ELF files. This exposed a bug in llvm-objcopy --update-section code path, that may lead to corruption of a target ELF file. Specifically, because of the bug st_shndx of some symbols may be (incorrectly) set to 0xffff (SHN_XINDEX) [2][3]. While there is a pending fix for LLVM, it'll take some time before it lands (likely in 22.x). And the kernel build must keep working with older LLVM toolchains in the foreseeable future. Using GNU objcopy for .BTF_ids update would work, but it would require changes to LLVM-based build process, likely breaking existing build environments as discussed in [2]. To work around llvm-objcopy bug, implement --patch_btfids code path in resolve_btfids as a drop-in replacement for: ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${elf} Which works specifically for .BTF_ids section: ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${elf} This feature in resolve_btfids can be removed at some point in the future, when llvm-objcopy with a relevant bugfix becomes common. [1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/ [2] https://lore.kernel.org/bpf/20251224005752.201911-1-ihor.solodrai@linux.dev/ [3] https://github.com/llvm/llvm-project/issues/168060#issuecomment-3533552952 Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20251231012558.1699758-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 2 +- scripts/link-vmlinux.sh | 2 +- tools/bpf/resolve_btfids/main.c | 117 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/bpf/Makefile | 2 +- 4 files changed, 120 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index 12244dbe097c..0aec86615416 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -123,7 +123,7 @@ embed_btf_data() fi local btf_ids="${ELF_FILE}.BTF_ids" if [ -f "${btf_ids}" ]; then - ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE} fi } diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index e2207e612ac3..1915adf3249b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -266,7 +266,7 @@ vmlinux_link "${VMLINUX}" if is_enabled CONFIG_DEBUG_INFO_BTF; then info OBJCOPY ${btfids_vmlinux} - ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} + ${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 2cbc252259be..df39982f51df 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -862,8 +862,119 @@ static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, cons return 0; } +/* + * Patch the .BTF_ids section of an ELF file with data from provided file. + * Equivalent to: objcopy --update-section .BTF_ids= + * + * 1. Find .BTF_ids section in the ELF + * 2. Verify that blob file size matches section size + * 3. Update section data buffer with blob data + * 4. Write the ELF file + */ +static int patch_btfids(const char *btfids_path, const char *elf_path) +{ + Elf_Scn *scn = NULL; + FILE *btfids_file; + size_t shdrstrndx; + int fd, err = -1; + Elf_Data *data; + struct stat st; + GElf_Shdr sh; + char *name; + Elf *elf; + + elf_version(EV_CURRENT); + + fd = open(elf_path, O_RDWR, 0666); + if (fd < 0) { + pr_err("FAILED to open %s: %s\n", elf_path, strerror(errno)); + return -1; + } + + elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + if (!elf) { + close(fd); + pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); + return -1; + } + + elf_flagelf(elf, ELF_C_SET, ELF_F_LAYOUT); + + if (elf_getshdrstrndx(elf, &shdrstrndx) != 0) { + pr_err("FAILED cannot get shdr str ndx\n"); + goto out; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + + if (gelf_getshdr(scn, &sh) != &sh) { + pr_err("FAILED to get section header\n"); + goto out; + } + + name = elf_strptr(elf, shdrstrndx, sh.sh_name); + if (!name) + continue; + + if (strcmp(name, BTF_IDS_SECTION) == 0) + break; + } + + if (!scn) { + pr_err("FAILED: section %s not found in %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + data = elf_getdata(scn, NULL); + if (!data) { + pr_err("FAILED to get %s section data from %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + if (stat(btfids_path, &st) < 0) { + pr_err("FAILED to stat %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + if ((size_t)st.st_size != data->d_size) { + pr_err("FAILED: size mismatch - %s section in %s is %zu bytes, %s is %zu bytes\n", + BTF_IDS_SECTION, elf_path, data->d_size, btfids_path, (size_t)st.st_size); + goto out; + } + + btfids_file = fopen(btfids_path, "rb"); + if (!btfids_file) { + pr_err("FAILED to open %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + pr_debug("Copying data from %s to %s section of %s (%zu bytes)\n", + btfids_path, BTF_IDS_SECTION, elf_path, data->d_size); + + if (fread(data->d_buf, data->d_size, 1, btfids_file) != 1) { + pr_err("FAILED to read %s\n", btfids_path); + fclose(btfids_file); + goto out; + } + fclose(btfids_file); + + elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY); + if (elf_update(elf, ELF_C_WRITE) < 0) { + pr_err("FAILED to update ELF file %s\n", elf_path); + goto out; + } + + err = 0; +out: + elf_end(elf); + close(fd); + + return err; +} + static const char * const resolve_btfids_usage[] = { "resolve_btfids [] ", + "resolve_btfids --patch_btfids <.BTF_ids file> ", NULL }; @@ -880,6 +991,7 @@ int main(int argc, const char **argv) .funcs = RB_ROOT, .sets = RB_ROOT, }; + const char *btfids_path = NULL; bool fatal_warnings = false; char out_path[PATH_MAX]; @@ -894,6 +1006,8 @@ int main(int argc, const char **argv) "turn warnings into errors"), OPT_BOOLEAN(0, "distill_base", &obj.distill_base, "distill --btf_base and emit .BTF.base section data"), + OPT_STRING(0, "patch_btfids", &btfids_path, "file", + "path to .BTF_ids section data blob to patch into ELF file"), OPT_END() }; int err = -1; @@ -905,6 +1019,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (btfids_path) + return patch_btfids(btfids_path, obj.path); + if (load_btf(&obj)) goto out; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f28a32b16ff0..9488d076c740 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -657,7 +657,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $$(if $$(TEST_NEEDS_BTFIDS), \ $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ - $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ -- cgit v1.2.3 From 673a55cc49dafe47defb9ad76a73987fe89e5d70 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:17:37 +0530 Subject: kselftest/coredump: use __builtin_trap() instead of null pointer Use __builtin_trap() to truly crash the program instead of dereferencing null pointer which may be optimized by the compiler preventing the crash from occurring [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1 CC stackdump_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. CC coredump_socket_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. CC coredump_socket_protocol_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. Link: https://lore.kernel.org/r/20251215084737.7504-1-clintbgeorge@gmail.com Signed-off-by: Clint George Signed-off-by: Shuah Khan --- tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c index a6f6d5f2ae07..5c8adee63641 100644 --- a/tools/testing/selftests/coredump/coredump_test_helpers.c +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -56,7 +56,7 @@ void crashing_child(void) pthread_create(&thread, NULL, do_nothing, NULL); /* crash on purpose */ - i = *(int *)NULL; + __builtin_trap(); } int create_detached_tmpfs(void) -- cgit v1.2.3 From 0aaff7b109037c0a45def1bed7b76ffaf253f7d0 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:19:00 +0530 Subject: kselftest/anon_inode: replace null pointers with empty arrays Make use of empty (NULL-terminated) array instead of NULL pointer to avoid compiler errors while maintaining the behavior of the function intact [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1 CC devpts_pts CC file_stressor CC anon_inode_test anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull] 45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0); | ^~~~ /usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL' 26 | #define NULL ((void*)0) | ^~~~~~~~~~ ../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT' 535 | __EXPECT(expected, #expected, seen, #seen, <, 1) | ^~~~~~~~ ../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT' 758 | __typeof__(_expected) __exp = (_expected); \ | ^~~~~~~~~ 1 warning generated. Link: https://lore.kernel.org/r/20251215084900.7590-1-clintbgeorge@gmail.com Signed-off-by: Clint George Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c index 94c6c81c2301..2c4c50500116 100644 --- a/tools/testing/selftests/filesystems/anon_inode_test.c +++ b/tools/testing/selftests/filesystems/anon_inode_test.c @@ -42,7 +42,10 @@ TEST(anon_inode_no_exec) fd_context = sys_fsopen("tmpfs", 0); ASSERT_GE(fd_context, 0); - ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0); + char *const empty_argv[] = {NULL}; + char *const empty_envp[] = {NULL}; + + ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0); ASSERT_EQ(errno, EACCES); EXPECT_EQ(close(fd_context), 0); -- cgit v1.2.3 From 3e6ad272bb8b3199bad952e7b077102af2d8df03 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:20:22 +0530 Subject: kselftest/kublk: include message in _Static_assert for C11 compatibility Add descriptive message in the _Static_assert to comply with the C11 standard requirement to prevent compiler from throwing out error. The compiler throws an error when _Static_assert is used without a message as that is a C23 extension. [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk$ make LLVM=1 W=1 CC kublk In file included from kublk.c:6: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from null.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from file_backed.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from common.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from stripe.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from fault_inject.c:11: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. make: *** [../lib.mk:225: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk/kublk] Error 1 Link: https://lore.kernel.org/r/20251215085022.7642-1-clintbgeorge@gmail.com Signed-off-by: Clint George Reviewed-by: Ming Lei Signed-off-by: Shuah Khan --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index fe42705c6d42..e5eb5f762c1c 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -217,7 +217,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, unsigned tgt_data, unsigned q_id, unsigned is_target_io) { /* we only have 7 bits to encode q_id */ - _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); + _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7"); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | (op << 16) | (tgt_data << 24) | -- cgit v1.2.3 From c286e7e9d1f1f3d90ad11c37e896f582b02d19c4 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 31 Dec 2025 14:10:50 -0800 Subject: selftests/bpf: veristat: fix printing order in output_stats() The order of the variables in the printf() doesn't match the text and therefore veristat prints something like this: Done. Processed 24 files, 0 programs. Skipped 62 files, 0 programs. When it should print: Done. Processed 24 files, 62 programs. Skipped 0 files, 0 programs. Fix the order of variables in the printf() call. Fixes: 518fee8bfaf2 ("selftests/bpf: make veristat skip non-BPF and failing-to-open BPF objects") Tested-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251231221052.759396-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index e962f133250c..1be1e353d40a 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2580,7 +2580,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last if (last && fmt == RESFMT_TABLE) { output_header_underlines(); printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n", - env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped); + env.files_processed, env.progs_processed, env.files_skipped, env.progs_skipped); } } -- cgit v1.2.3 From a73fc3dcc60b6d7a2075e2fbdca64fd53600f855 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:10:58 -0800 Subject: rcu: Clean up after the SRCU-fastification of RCU Tasks Trace Now that RCU Tasks Trace has been re-implemented in terms of SRCU-fast, the ->trc_ipi_to_cpu, ->trc_blkd_cpu, ->trc_blkd_node, ->trc_holdout_list, and ->trc_reader_special task_struct fields are no longer used. In addition, the rcu_tasks_trace_qs(), rcu_tasks_trace_qs_blkd(), exit_tasks_rcu_finish_trace(), and rcu_spawn_tasks_trace_kthread(), show_rcu_tasks_trace_gp_kthread(), rcu_tasks_trace_get_gp_data(), rcu_tasks_trace_torture_stats_print(), and get_rcu_tasks_trace_gp_kthread() functions and all the other functions that they invoke are no longer used. Also, the TRC_NEED_QS and TRC_NEED_QS_CHECKED CPP macros are no longer used. Neither are the rcu_tasks_trace_lazy_ms and rcu_task_ipi_delay rcupdate module parameters and the TASKS_TRACE_RCU_READ_MB Kconfig option. This commit therefore removes all of them. [ paulmck: Apply Alexei Starovoitov feedback. ] Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: bpf@vger.kernel.org Reviewed-by: Joel Fernandes Signed-off-by: Boqun Feng --- Documentation/admin-guide/kernel-parameters.txt | 15 ---- include/linux/rcupdate.h | 31 +-------- include/linux/rcupdate_trace.h | 2 - include/linux/sched.h | 5 -- init/init_task.c | 3 - kernel/fork.c | 3 - kernel/rcu/Kconfig | 18 ----- kernel/rcu/rcu.h | 9 --- kernel/rcu/rcuscale.c | 7 -- kernel/rcu/rcutorture.c | 2 - kernel/rcu/tasks.h | 79 +--------------------- .../selftests/rcutorture/configs/rcu/TRACE01 | 1 - .../selftests/rcutorture/configs/rcu/TRACE02 | 1 - 13 files changed, 2 insertions(+), 174 deletions(-) (limited to 'tools') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..1b8e5cadbecb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6249,13 +6249,6 @@ Kernel parameters dynamically) adjusted. This parameter is intended for use in testing. - rcupdate.rcu_task_ipi_delay= [KNL] - Set time in jiffies during which RCU tasks will - avoid sending IPIs, starting with the beginning - of a given grace period. Setting a large - number avoids disturbing real-time workloads, - but lengthens grace periods. - rcupdate.rcu_task_lazy_lim= [KNL] Number of callbacks on a given CPU that will cancel laziness on that CPU. Use -1 to disable @@ -6299,14 +6292,6 @@ Kernel parameters of zero will disable batching. Batching is always disabled for synchronize_rcu_tasks(). - rcupdate.rcu_tasks_trace_lazy_ms= [KNL] - Set timeout in milliseconds RCU Tasks - Trace asynchronous callback batching for - call_rcu_tasks_trace(). A negative value - will take the default. A value of zero will - disable batching. Batching is always disabled - for synchronize_rcu_tasks_trace(). - rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c5b30054cd01..bd5a420cf09a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -175,36 +175,7 @@ void rcu_tasks_torture_stats_print(char *tt, char *tf); # define synchronize_rcu_tasks synchronize_rcu # endif -# ifdef CONFIG_TASKS_TRACE_RCU -// Bits for ->trc_reader_special.b.need_qs field. -#define TRC_NEED_QS 0x1 // Task needs a quiescent state. -#define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. - -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); -void rcu_tasks_trace_qs_blkd(struct task_struct *t); - -# define rcu_tasks_trace_qs(t) \ - do { \ - int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ - \ - if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ - likely(!___rttq_nesting)) { \ - rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ - } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ - !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ - rcu_tasks_trace_qs_blkd(t); \ - } \ - } while (0) -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); -# else -# define rcu_tasks_trace_qs(t) do { } while (0) -# endif - -#define rcu_tasks_qs(t, preempt) \ -do { \ - rcu_tasks_classic_qs((t), (preempt)); \ - rcu_tasks_trace_qs(t); \ -} while (0) +#define rcu_tasks_qs(t, preempt) rcu_tasks_classic_qs((t), (preempt)) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 3f46cbe67000..0bd47f12ecd1 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -136,9 +136,7 @@ static inline void rcu_barrier_tasks_trace(void) } // Placeholders to enable stepwise transition. -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); void __init rcu_tasks_trace_suppress_unused(void); -struct task_struct *get_rcu_tasks_trace_gp_kthread(void); #else /* diff --git a/include/linux/sched.h b/include/linux/sched.h index fe39d422b37d..56156643ccac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -946,11 +946,6 @@ struct task_struct { #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; struct srcu_ctr __percpu *trc_reader_scp; - int trc_ipi_to_cpu; - union rcu_special trc_reader_special; - struct list_head trc_holdout_list; - struct list_head trc_blkd_node; - int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; diff --git a/init/init_task.c b/init/init_task.c index 49b13d7c3985..db92c404d59a 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -195,9 +195,6 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, - .trc_reader_special.s = 0, - .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), - .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node), #endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, diff --git a/kernel/fork.c b/kernel/fork.c index b1f3915d5f8e..d7ed107cbb47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1828,9 +1828,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; - p->trc_reader_special.s = 0; - INIT_LIST_HEAD(&p->trc_holdout_list); - INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4d9b21f69eaa..8d5a1ecb7d56 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -313,24 +313,6 @@ config RCU_NOCB_CPU_CB_BOOST Say Y here if you want to set RT priority for offloading kthreads. Say N here if you are building a !PREEMPT_RT kernel and are unsure. -config TASKS_TRACE_RCU_READ_MB - bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT && TASKS_TRACE_RCU - default PREEMPT_RT || NR_CPUS < 8 - help - Use this option to further reduce the number of IPIs sent - to CPUs executing in userspace or idle during tasks trace - RCU grace periods. Given that a reasonable setting of - the rcupdate.rcu_task_ipi_delay kernel boot parameter - eliminates such IPIs for many workloads, proper setting - of this Kconfig option is important mostly for aggressive - real-time installations and for battery-powered devices, - hence the default chosen above. - - Say Y here if you hate IPIs. - Say N here if you hate read-side memory barriers. - Take the default if you are unsure. - config RCU_LAZY bool "RCU callback lazy invocation functionality" depends on RCU_NOCB_CPU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 9cf01832a6c3..dc5d614b372c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void); void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU -#ifdef CONFIG_TASKS_TRACE_RCU -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); -#endif - #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void); #else static inline void show_rcu_tasks_rude_gp_kthread(void) {} #endif -#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) -void show_rcu_tasks_trace_gp_kthread(void); -#else -static inline void show_rcu_tasks_trace_gp_kthread(void) {} -#endif #ifdef CONFIG_TINY_RCU static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 7484d8ad5767..1c50f89fbd6f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } -static void rcu_tasks_trace_scale_stats(void) -{ - rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); -} - static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, - .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, - .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 07e51974b06b..78a6ebe77d35 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1180,8 +1180,6 @@ static struct rcu_torture_ops tasks_tracing_ops = { .exp_sync = synchronize_rcu_tasks_trace, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, - .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, - .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .slow_gps = 1, diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1fe789c99f36..1249b47f0a8d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif -/* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) -static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; -module_param(rcu_task_ipi_delay, int, 0644); - /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -800,8 +795,6 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t #endif // #ifndef CONFIG_TINY_RCU -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - #if defined(CONFIG_TASKS_RCU) //////////////////////////////////////////////////////////////////////// @@ -1321,13 +1314,11 @@ void exit_tasks_rcu_finish(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - - exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1475,69 +1466,6 @@ void __init rcu_tasks_trace_suppress_unused(void) #endif // #ifndef CONFIG_TINY_RCU } -/* - * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for - * the four-byte operand-size restriction of some platforms. - * - * Returns the old value, which is often ignored. - */ -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) -{ - return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); -} -EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); - -/* Add a newly blocked reader task to its CPU's list. */ -void rcu_tasks_trace_qs_blkd(struct task_struct *t) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); - -/* Communicate task state back to the RCU tasks trace stall warning request. */ -struct trc_stall_chk_rdr { - int nesting; - int ipi_to_cpu; - u8 needqs; -}; - -/* Report any needed quiescent state for this exiting task. */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) -{ -} - -int rcu_tasks_trace_lazy_ms = -1; -module_param(rcu_tasks_trace_lazy_ms, int, 0444); - -static int __init rcu_spawn_tasks_trace_kthread(void) -{ - return 0; -} - -#if !defined(CONFIG_TINY_RCU) -void show_rcu_tasks_trace_gp_kthread(void) -{ -} -EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); -#endif // !defined(CONFIG_TINY_RCU) - -struct task_struct *get_rcu_tasks_trace_gp_kthread(void) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); - -#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ #ifndef CONFIG_TINY_RCU @@ -1545,7 +1473,6 @@ void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); - show_rcu_tasks_trace_gp_kthread(); } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -1684,10 +1611,6 @@ static int __init rcu_init_tasks_generic(void) rcu_spawn_tasks_rude_kthread(); #endif -#ifdef CONFIG_TASKS_TRACE_RCU - rcu_spawn_tasks_trace_kthread(); -#endif - // Run the self-tests. rcu_tasks_initiate_self_tests(); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index 85b407467454..18efab346381 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -10,5 +10,4 @@ CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 9003c56cd764..8da390e82829 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -9,6 +9,5 @@ CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y CONFIG_DEBUG_OBJECTS=y -- cgit v1.2.3 From 3ce40539cc0055b2df49303979c5b6a4a8321be4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:54 -0800 Subject: torture: Parallelize kvm-series.sh guest-OS execution Currently, kvm-series.sh builds and runs serially, which makes for long execution times. This commit changes its logic to build all of the needed kernels serially, but then run the corresponding guest OSes concurrently in batches using the entire machine. On large systems, this results in order-of-magnitude speedups of the guest-OS execution portion of the runtime. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- .../testing/selftests/rcutorture/bin/kvm-series.sh | 174 ++++++++++++++++++--- 1 file changed, 153 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index 2ff905a1853b..6729687861f2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -15,7 +15,7 @@ # This script is intended to replace kvm-check-branches.sh by providing # ease of use and faster execution. -T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`" +T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"; export T trap 'rm -rf $T' 0 scriptname=$0 @@ -53,40 +53,62 @@ shift RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE PATH=${RCUTORTURE}/bin:$PATH; export PATH +RES="${RCUTORTURE}/res"; export RES . functions.sh ret=0 -nfail=0 +nbuildfail=0 +nrunfail=0 nsuccess=0 -faillist= +ncpus=0 +buildfaillist= +runfaillist= successlist= cursha1="`git rev-parse --abbrev-ref HEAD`" ds="`date +%Y.%m.%d-%H.%M.%S`-series" +DS="${RES}/${ds}"; export DS startdate="`date`" starttime="`get_starttime`" echo " --- " $scriptname $args | tee -a $T/log echo " --- Results directory: " $ds | tee -a $T/log +# Do all builds. Iterate through commits within a given scenario +# because builds normally go faster from one commit to the next within a +# given scenario. In contrast, switching scenarios on each rebuild will +# often force a full rebuild due to Kconfig differences, for example, +# turning preemption on and off. Defer actual runs in order to run +# lots of them concurrently on large systems. +touch $T/torunlist for config in ${config_list} do sha_n=0 for sha in ${sha1_list} do sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. + echo echo Starting ${config}/${sha1} at `date` | tee -a $T/log - git checkout "${sha}" - time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@" + git checkout --detach "${sha}" + tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@" curret=$? if test "${curret}" -ne 0 then - nfail=$((nfail+1)) - faillist="$faillist ${config}/${sha1}(${curret})" + nbuildfail=$((nbuildfail+1)) + buildfaillist="$buildfaillist ${config}/${sha1}(${curret})" else - nsuccess=$((nsuccess+1)) - successlist="$successlist ${config}/${sha1}" - # Successful run, so remove large files. - rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers} + batchncpus="`grep -v "^# cpus=" "${DS}/${config}/${sha1}/batches" | awk '{ sum += $3 } END { print sum }'`" + echo run_one_qemu ${sha_n} ${config}/${sha1} ${batchncpus} >> $T/torunlist + if test "${ncpus}" -eq 0 + then + ncpus="`grep "^# cpus=" "${DS}/${config}/${sha1}/batches" | sed -e 's/^# cpus=//'`" + case "${ncpus}" in + ^[0-9]*$) + ;; + *) + ncpus=0 + ;; + esac + fi fi if test "${ret}" -eq 0 then @@ -95,22 +117,132 @@ do sha_n=$((sha_n+1)) done done + +# If the user did not specify the number of CPUs, use them all. +if test "${ncpus}" -eq 0 +then + ncpus="`identify_qemu_vcpus`" +fi + +cpusused=0 +touch $T/successlistfile +touch $T/faillistfile + +# do_run_one_qemu ds resultsdir qemu_curout +# +# Start the specified qemu run and record its success or failure. +do_run_one_qemu () { + local ret + local ds="$1" + local resultsdir="$2" + local qemu_curout="$3" + + tools/testing/selftests/rcutorture/bin/kvm-again.sh "${DS}/${resultsdir}" --link inplace-force > ${qemu_curout} 2>&1 + ret=$? + if test "${ret}" -eq 0 + then + echo ${resultsdir} >> $T/successlistfile + # Successful run, so remove large files. + rm -f ${DS}/${resultsdir}/{vmlinux,bzImage,System.map,Module.symvers} + else + echo "${resultsdir}(${ret})" >> $T/faillistfile + fi +} + +# cleanup_qemu_batch batchncpus +# +# Update success and failure lists, files, and counts at the end of +# a batch. +cleanup_qemu_batch () { + local batchncpus="$1" + + echo Waiting, cpusused=${cpusused}, ncpus=${ncpus} `date` | tee -a $T/log + wait + cpusused="${batchncpus}" + nsuccessbatch="`wc -l $T/successlistfile | awk '{ print $1 }'`" + nsuccess=$((nsuccess+nsuccessbatch)) + successlist="$successlist `cat $T/successlistfile`" + rm $T/successlistfile + touch $T/successlistfile + nfailbatch="`wc -l $T/faillistfile | awk '{ print $1 }'`" + nrunfail=$((nrunfail+nfailbatch)) + runfaillist="$runfaillist `cat $T/faillistfile`" + rm $T/faillistfile + touch $T/faillistfile +} + +# run_one_qemu sha_n config/sha1 batchncpus +# +# Launch into the background the sha_n-th qemu job whose results directory +# is config/sha1 and which uses batchncpus CPUs. Once we reach a job that +# would overflow the number of available CPUs, wait for the previous jobs +# to complete and record their results. +run_one_qemu () { + local sha_n="$1" + local config_sha1="$2" + local batchncpus="$3" + local qemu_curout + + cpusused=$((cpusused+batchncpus)) + if test "${cpusused}" -gt $ncpus + then + cleanup_qemu_batch "${batchncpus}" + fi + echo Starting ${config_sha1} using ${batchncpus} CPUs `date` + qemu_curout="${DS}/${config_sha1}/qemu-series" + do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} & +} + +# Re-ordering the runs will mess up the affinity chosen at build time +# (among other things, over-using CPU 0), so suppress it. +TORTURE_NO_AFFINITY="no-affinity"; export TORTURE_NO_AFFINITY + +# Run the kernels (if any) that built correctly. +echo | tee -a $T/log # Put a blank line between build and run messages. +. $T/torunlist +cleanup_qemu_batch "${batchncpus}" + +# Get back to initial checkout/SHA-1. git checkout "${cursha1}" -echo ${nsuccess} SUCCESSES: | tee -a $T/log -echo ${successlist} | fmt | tee -a $T/log -echo | tee -a $T/log -echo ${nfail} FAILURES: | tee -a $T/log -echo ${faillist} | fmt | tee -a $T/log -if test -n "${faillist}" +# Throw away leading and trailing space characters for fmt. +successlist="`echo ${successlist} | sed -e 's/^ *//' -e 's/ *$//'`" +buildfaillist="`echo ${buildfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" +runfaillist="`echo ${runfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" + +# Print lists of successes, build failures, and run failures, if any. +if test "${nsuccess}" -gt 0 +then + echo | tee -a $T/log + echo ${nsuccess} SUCCESSES: | tee -a $T/log + echo ${successlist} | fmt | tee -a $T/log +fi +if test "${nbuildfail}" -gt 0 then echo | tee -a $T/log - echo Failures across commits: | tee -a $T/log - echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | + echo ${nbuildfail} BUILD FAILURES: | tee -a $T/log + echo ${buildfaillist} | fmt | tee -a $T/log +fi +if test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo ${nrunfail} RUN FAILURES: | tee -a $T/log + echo ${runfaillist} | fmt | tee -a $T/log +fi + +# If there were build or runtime failures, map them to commits. +if test "${nbuildfail}" -gt 0 || test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo Build failures across commits: | tee -a $T/log + echo ${buildfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | sort | uniq -c | sort -k2n | tee -a $T/log fi + +# Print run summary. +echo | tee -a $T/log echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log -echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log -cp $T/log tools/testing/selftests/rcutorture/res/${ds} +echo Summary: Successes: ${nsuccess} " "Build Failures: ${nbuildfail} " "Runtime Failures: ${nrunfail}| tee -a $T/log +cp $T/log ${DS} exit "${ret}" -- cgit v1.2.3 From 672621773f7df8cda2ff5635edac4aa5339d097f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:55 -0800 Subject: torture: Make kvm-series.sh give build numbers and totals The kvm-series.sh script can easily be convinced to do on the order of 1,000 builds, so some sort of progress indicator would be helpful. This commit therefore updates the "Starting" output lines to read as in the following example, adding the ("2 of 4"): Starting TREE01/1.7e0ad1b49057 (2 of 4) at Sat Nov 8 10:08:21 PM PST 2025 Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm-series.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index 6729687861f2..a00d2e96f6cc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -32,6 +32,7 @@ then echo "$0: Repetition ('*') not allowed in config list." exit 1 fi +config_list_len="`echo ${config_list} | wc -w | awk '{ print $1; }'`" commit_list="${2}" if test -z "${commit_list}" @@ -47,6 +48,7 @@ then exit 2 fi sha1_list=`cat $T/commits` +sha1_list_len="`echo ${sha1_list} | wc -w | awk '{ print $1; }'`" shift shift @@ -80,6 +82,8 @@ echo " --- Results directory: " $ds | tee -a $T/log # turning preemption on and off. Defer actual runs in order to run # lots of them concurrently on large systems. touch $T/torunlist +n2build="$((config_list_len*sha1_list_len))" +nbuilt=0 for config in ${config_list} do sha_n=0 @@ -87,7 +91,7 @@ do do sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. echo - echo Starting ${config}/${sha1} at `date` | tee -a $T/log + echo Starting ${config}/${sha1} "($((nbuilt+1)) of ${n2build})" at `date` | tee -a $T/log git checkout --detach "${sha}" tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@" curret=$? @@ -115,6 +119,7 @@ do ret=${curret} fi sha_n=$((sha_n+1)) + nbuilt=$((nbuilt+1)) done done -- cgit v1.2.3 From 3d69b6beb8ba932911096138394b5cd3e21b3b92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:56 -0800 Subject: torture: Make kvm-series.sh give run numbers and totals The kvm-series.sh script can easily be convinced to run on the order of 1,000 guest OSes, so some sort of progress indicator would be helpful. This commit therefore updates the "Starting" output lines to read as in the following example, adding the ("3 of 4"): Starting TREE02/1.7e0ad1b49057 using 8 CPUs (4 of 4) Sat Nov 8 10:51:06 PM PST 2025 Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm-series.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index a00d2e96f6cc..c4ee5f910931 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -132,6 +132,8 @@ fi cpusused=0 touch $T/successlistfile touch $T/faillistfile +n2run="`wc -l $T/torunlist | awk '{ print $1; }'`" +nrun=0 # do_run_one_qemu ds resultsdir qemu_curout # @@ -193,9 +195,10 @@ run_one_qemu () { then cleanup_qemu_batch "${batchncpus}" fi - echo Starting ${config_sha1} using ${batchncpus} CPUs `date` + echo Starting ${config_sha1} using ${batchncpus} CPUs "($((nrun+1)) of ${n2run})" `date` qemu_curout="${DS}/${config_sha1}/qemu-series" do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} & + nrun="$((nrun+1))" } # Re-ordering the runs will mess up the affinity chosen at build time -- cgit v1.2.3 From dcd6067322ba6750995fbc5486d7c9ada88489ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:57 -0800 Subject: torture: Make config2csv.sh properly handle comments in .boot files As in strip the "#" and everything after it and *then* tokenize. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/config2csv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/config2csv.sh b/tools/testing/selftests/rcutorture/bin/config2csv.sh index 0cf55f1bf654..aeab4d6f11ad 100755 --- a/tools/testing/selftests/rcutorture/bin/config2csv.sh +++ b/tools/testing/selftests/rcutorture/bin/config2csv.sh @@ -42,7 +42,7 @@ do grep -v '^#' < $i | grep -v '^ *$' > $T/p if test -r $i.boot then - tr -s ' ' '\012' < $i.boot | grep -v '^#' >> $T/p + sed -e 's/#.*$//' < $i.boot | tr -s ' ' '\012' >> $T/p fi sed -e 's/^[^=]*$/&=?/' < $T/p | sed -e 's/^\([^=]*\)=\(.*\)$/\tp["\1:'"$i"'"] = "\2";\n\tc["\1"] = 1;/' >> $T/p.awk -- cgit v1.2.3 From c89474b9b2ab8ab2c0d2cddadbed781c0f5e8f0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:58 -0800 Subject: torture: Include commit discription in testid.txt Currently, the testid.txt file in the top-level directory of the rcutorture results contains the output of "git rev-parse HEAD", which just gives the full SHA-1 of the current commit. This is followed by the output of "git status", which is further followed by the output of "git diff". This works, but is less than helpful to human readers scanning a list of commits. This commit therefore instead uses "git show --oneline --no-patch HEAD", which provides a short SHA-1, but also the names of any branches and the commit's title. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/mktestid.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/mktestid.sh b/tools/testing/selftests/rcutorture/bin/mktestid.sh index 16f9907a4dae..24f6261dab6a 100755 --- a/tools/testing/selftests/rcutorture/bin/mktestid.sh +++ b/tools/testing/selftests/rcutorture/bin/mktestid.sh @@ -18,7 +18,7 @@ fi echo Build directory: `pwd` > ${resdir}/testid.txt if test -d .git then - echo Current commit: `git rev-parse HEAD` >> ${resdir}/testid.txt + echo Current commit: `git show --oneline --no-patch HEAD` >> ${resdir}/testid.txt echo >> ${resdir}/testid.txt echo ' ---' Output of "'"git status"'": >> ${resdir}/testid.txt git status >> ${resdir}/testid.txt -- cgit v1.2.3 From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 13 ++++++------- fs/verity/measure.c | 2 +- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- kernel/bpf/arena.c | 6 +++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/helpers.c | 20 ++++++++++---------- kernel/bpf/map_iter.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/sched/ext.c | 8 ++++---- mm/bpf_memcontrol.c | 10 +++++----- net/core/filter.c | 10 +++++----- net/core/xdp.c | 2 +- net/netfilter/nf_conntrack_bpf.c | 8 ++++---- net/netfilter/nf_flow_table_bpf.c | 2 +- net/netfilter/nf_nat_bpf.c | 2 +- net/sched/bpf_qdisc.c | 12 ++++++------ tools/testing/selftests/bpf/progs/cpumask_failure.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 20 ++++++++++---------- 19 files changed, 63 insertions(+), 65 deletions(-) (limited to 'tools') diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index abd5eaa4892e..e4e51a1d0de2 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_task_exe_file, - KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_path_d_path) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 388734132f01..6a35623ebdf0 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di __bpf_kfunc_end_defs(); BTF_KFUNCS_START(fsverity_set_ids) -BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest) BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 456ac989269d..2274319a95e6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..2c15f77c74db 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 359a962d69a1..c9da70dd3e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 94164f2dec6d..fd5423428dde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e8fa7f5855f9..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_memcontrol_kfuncs) diff --git a/net/core/filter.c b/net/core/filter.c index 616e0520a0bb..d43df98e1ded 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..fee6d080ee85 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..a630139bd0c3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..b9771788b9b3 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 90c4b1a51de6..1c41d03bd5a1 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) -- cgit v1.2.3 From df5004579bbdfe4c734f2cbbf3f44fe3fac440a3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:32 -0800 Subject: selftests: bpf: Update kfunc_param_nullable test for new error message With trusted args now being the default, the NULL pointer check runs before type-specific validation. Update test3 to expect the new error message "Possibly NULL pointer passed to trusted arg0" instead of the old dynptr-specific error message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-7-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 0ad1bf1ede8d..967081bbcfe1 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("expected pointer to stack or const struct bpf_dynptr") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; -- cgit v1.2.3 From 03cc77b10e009ce87f1a8e93454aadf2912a4c15 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:33 -0800 Subject: selftests: bpf: Update failure message for rbtree_fail The rbtree_api_use_unchecked_remove_retval() selftest passes a pointer received from bpf_rbtree_remove() to bpf_rbtree_add() without checking for NULL, this was earlier caught by __check_ptr_off_reg() in the verifier. Now the verifier assumes every kfunc only takes trusted pointer arguments, so it catches this NULL pointer earlier in the path and provides a more accurate failure message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-8-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 4acb6af2dfe3..70b7baf9304b 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed") +__failure __msg("Possibly NULL pointer passed to trusted arg1") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; -- cgit v1.2.3 From 230b0118e416583a53fc0ad5d1fecb37f496fe34 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:34 -0800 Subject: selftests: bpf: fix test_kfunc_dynptr_param As verifier now assumes that all kfuncs only takes trusted pointer arguments, passing 0 (NULL) to a kfunc that doesn't mark the argument as __nullable or __opt will be rejected with a failure message of: Possibly NULL pointer passed to trusted arg Pass a non-null value to the kfunc to test the expected failure mode. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-9-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 061befb004c2..d249113ed657 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -48,10 +48,9 @@ SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val = 0; + static struct bpf_dynptr val; - return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val, - (struct bpf_dynptr *)val, NULL); + return bpf_verify_pkcs7_signature(&val, &val, NULL); } SEC("lsm.s/bpf") -- cgit v1.2.3 From cf82580c86a91de2aa979260985cadcb39ed28d2 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:35 -0800 Subject: selftests: bpf: fix cgroup_hierarchical_stats The cgroup_hierarchical_stats selftests uses an fentry program attached to cgroup_attach_task and then passes the received &dst_cgrp->self to the css_rstat_updated() kfunc. The verifier now assumes that all kfuncs only takes trusted pointer arguments, and pointers received by fentry are not marked trustes by default. Use a tp_btf program in place for fentry for this test, pointers received by tp_btf programs are marked trusted by the verifier. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-10-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index ff189a736ad8..8fc38592a87b 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -62,9 +62,9 @@ static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending) &init, BPF_NOEXIST); } -SEC("fentry/cgroup_attach_task") -int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +SEC("tp_btf/cgroup_attach_task") +int BPF_PROG(counter, struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup) { __u64 cg_id = cgroup_id(dst_cgrp); struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem( -- cgit v1.2.3 From cf503eb2c6c38bf449063f33790a96218a067718 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:36 -0800 Subject: selftests: bpf: Fix test_bpf_nf for trusted args becoming default With trusted args now being the default, passing NULL to kfunc parameters that are pointers causes verifier rejection rather than a runtime error. The test_bpf_nf test was failing because it attempted to pass NULL to bpf_xdp_ct_lookup() to verify runtime error handling. Since the NULL check now happens at verification time, remove the runtime test case that passed NULL to the bpf_tuple parameter and instead add verification-time tests to ensure the verifier correctly rejects programs that pass NULL to trusted arguments. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-11-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 5 +- tools/testing/selftests/bpf/progs/test_bpf_nf.c | 7 --- .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 57 ++++++++++++++++++++++ 3 files changed, 61 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index dd6512fa652b..215878ea04de 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -19,6 +19,10 @@ struct { { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, }; enum { @@ -111,7 +115,6 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(err, "bpf_prog_test_run")) goto end; - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index f7b330ddd007..076fbf03a126 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -15,7 +15,6 @@ extern unsigned long CONFIG_HZ __kconfig; -int test_einval_bpf_tuple = 0; int test_einval_reserved = 0; int test_einval_reserved_new = 0; int test_einval_netns_id = 0; @@ -99,12 +98,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); - ct = lookup_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def)); - if (ct) - bpf_ct_release(ct); - else - test_einval_bpf_tuple = opts_def.error; - opts_def.reserved[0] = 1; ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index a586f087ffeb..2c156cd166af 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_misc.h" struct nf_conn; @@ -18,6 +19,10 @@ struct nf_conn *bpf_skb_ct_alloc(struct __sk_buff *, struct bpf_sock_tuple *, u3 struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32, struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym; void bpf_ct_release(struct nf_conn *) __ksym; void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym; @@ -146,4 +151,56 @@ int change_status_after_alloc(struct __sk_buff *ctx) return 0; } +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int lookup_null_bpf_tuple(struct __sk_buff *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int lookup_null_bpf_opts(struct __sk_buff *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From ec4bb8e8dfa060c699b548f62e4d56133aafbbec Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 24 Sep 2025 16:20:58 +0200 Subject: tools/nolibc: add ptrace support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ptrace support, as it will be useful in UML. Signed-off-by: Benjamin Berg [Thomas: drop va_args usage and linux/uio.h inclusion] Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 1 + tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/sys/ptrace.h | 33 ++++++++++++++++++++++++++++ tools/testing/selftests/nolibc/nolibc-test.c | 2 ++ 4 files changed, 37 insertions(+) create mode 100644 tools/include/nolibc/sys/ptrace.h (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 8118e22844f1..8b883a6fe580 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -54,6 +54,7 @@ all_files := \ sys/mman.h \ sys/mount.h \ sys/prctl.h \ + sys/ptrace.h \ sys/random.h \ sys/reboot.h \ sys/resource.h \ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 272dfc961158..9c7f43b9218b 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -101,6 +101,7 @@ #include "sys/mman.h" #include "sys/mount.h" #include "sys/prctl.h" +#include "sys/ptrace.h" #include "sys/random.h" #include "sys/reboot.h" #include "sys/resource.h" diff --git a/tools/include/nolibc/sys/ptrace.h b/tools/include/nolibc/sys/ptrace.h new file mode 100644 index 000000000000..72ca28541633 --- /dev/null +++ b/tools/include/nolibc/sys/ptrace.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ptrace for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + * Copyright (C) 2025 Intel Corporation + */ + +/* make sure to include all global symbols */ +#include "../nolibc.h" + +#ifndef _NOLIBC_SYS_PTRACE_H +#define _NOLIBC_SYS_PTRACE_H + +#include "../sys.h" + +#include + +/* + * long ptrace(int op, pid_t pid, void *addr, void *data); + */ +static __attribute__((unused)) +long sys_ptrace(int op, pid_t pid, void *addr, void *data) +{ + return my_syscall4(__NR_ptrace, op, pid, addr, data); +} + +static __attribute__((unused)) +ssize_t ptrace(int op, pid_t pid, void *addr, void *data) +{ + return __sysret(sys_ptrace(op, pid, addr, data)); +} + +#endif /* _NOLIBC_SYS_PTRACE_H */ diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 3c5a226dad3a..6888b20af259 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1406,6 +1407,7 @@ int run_syscall(int min, int max) CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; + CASE_TEST(ptrace); EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break; CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break; CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break; CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break; -- cgit v1.2.3 From cc6809f6728456c03db6750fcc94ed8b581a2cf8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 3 Dec 2025 20:08:00 +0100 Subject: tools/nolibc: always use 64-bit mode for s390 header checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 32-bit s390 support was recently removed from nolibc. If the compiler defaults to 32-bit during the header checks, they fail. Make sure to always use 64-bit mode for s390 heafer checks. Fixes: 169ebcbb9082 ("tools: Remove s390 compat support") Acked-by: Willy Tarreau Acked-by: Heiko Carstens Link: https://patch.msgid.link/20251203-nolibc-headers-check-s390-v1-1-5d35e52a83ba@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 8b883a6fe580..1958dda98895 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -104,9 +104,12 @@ headers_standalone: headers $(Q)$(MAKE) -C $(srctree) headers $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)sysroot +CFLAGS_s390 := -m64 +CFLAGS := $(CFLAGS_$(ARCH)) + headers_check: headers_standalone $(Q)for header in $(filter-out crt.h std.h,$(all_files)); do \ - $(CC) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \ + $(CC) $(CFLAGS) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \ -I$(or $(objtree),$(srctree))/usr/include -include $$header -include $$header || exit 1; \ done -- cgit v1.2.3 From f675e35dd28f1ac326b1d6520fee3605019b381b Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:45 +0100 Subject: tools/nolibc/poll: use kernel types for system call invocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The system calls expect 'struct __kernel_old_timespec'. While currently 'struct __kernel_old_timespec' and 'struct timespec' are compatible, this is confusing. Especially as future patches will change the definition of 'struct timespec'. Use the correct kernel type instead. Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/lkml/fbca1d3e-12e4-4c4e-8091-87464035fe39@app.fastmail.com/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-1-c662992f75d7@weissschuh.net --- tools/include/nolibc/poll.h | 2 +- tools/include/nolibc/sys/select.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index 0d053f93ea99..df952bcf0905 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -24,7 +24,7 @@ static __attribute__((unused)) int sys_poll(struct pollfd *fds, int nfds, int timeout) { #if defined(__NR_ppoll) - struct timespec t; + struct __kernel_old_timespec t; if (timeout >= 0) { t.tv_sec = timeout / 1000; diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h index 2a5619c01277..9a29e5b98a3c 100644 --- a/tools/include/nolibc/sys/select.h +++ b/tools/include/nolibc/sys/select.h @@ -75,7 +75,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva #elif defined(__NR_select) return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout); #elif defined(__NR_pselect6) - struct timespec t; + struct __kernel_old_timespec t; if (timeout) { t.tv_sec = timeout->tv_sec; -- cgit v1.2.3 From 548d682649f02f4240e8ea4e99f1899978e1cfe4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:46 +0100 Subject: tools/nolibc/poll: drop __NR_poll fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fallback is never used, remove it. Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/lkml/fbca1d3e-12e4-4c4e-8091-87464035fe39@app.fastmail.com/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-2-c662992f75d7@weissschuh.net --- tools/include/nolibc/poll.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index df952bcf0905..5b4fa339fbb5 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -31,7 +31,7 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout) t.tv_nsec = (timeout % 1000) * 1000000; } return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#elif defined(__NR_ppoll_time64) +#else struct __kernel_timespec t; if (timeout >= 0) { @@ -39,8 +39,6 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout) t.tv_nsec = (timeout % 1000) * 1000000; } return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#else - return my_syscall3(__NR_poll, fds, nfds, timeout); #endif } -- cgit v1.2.3 From 668e43737279284318064bdd4eab689a3aaed652 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:47 +0100 Subject: tools/nolibc/select: drop non-pselect based implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These implementations use the libc 'struct timeval' with system calls which can lead to type mismatches. Currently this is fine, but will break with upcoming changes to 'struct timeval'. If the structure needs to be converted anyways, the implementations based on pselect can be used for all architectures. This simplifies the logic. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-3-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/select.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h index 9a29e5b98a3c..50b77dace7ef 100644 --- a/tools/include/nolibc/sys/select.h +++ b/tools/include/nolibc/sys/select.h @@ -63,18 +63,7 @@ typedef struct { static __attribute__((unused)) int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) { -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__NR__newselect) - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_select) - return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_pselect6) +#if defined(__NR_pselect6) struct __kernel_old_timespec t; if (timeout) { -- cgit v1.2.3 From b8f4f5d1b99e2ae73fd448e9bbd16dc244e6586c Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:48 +0100 Subject: tools/nolibc/time: drop invocation of gettimeofday system call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This invocation uses libc types with a system call. While this works now, upcoming changes to 'struct timeval' would require type conversions. If types are converted anyways, the clock_gettime() based fallback can be used everywhere, simplifying the code. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-4-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/time.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys/time.h b/tools/include/nolibc/sys/time.h index 33782a19aae9..171187836e6d 100644 --- a/tools/include/nolibc/sys/time.h +++ b/tools/include/nolibc/sys/time.h @@ -22,9 +22,6 @@ static int sys_clock_gettime(clockid_t clockid, struct timespec *tp); static __attribute__((unused)) int sys_gettimeofday(struct timeval *tv, struct timezone *tz) { -#ifdef __NR_gettimeofday - return my_syscall2(__NR_gettimeofday, tv, tz); -#else (void) tz; /* Non-NULL tz is undefined behaviour */ struct timespec tp; @@ -37,7 +34,6 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz) } return ret; -#endif } static __attribute__((unused)) -- cgit v1.2.3 From ba7fd0384530e3dd20ea873aac21c473e3e461ae Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:49 +0100 Subject: tools/nolibc: prefer explicit 64-bit time-related system calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure to always use the 64-bit safe system calls in preparation for 64-bit time_t on 32-bit architectures. Also prevent issues on kernels which disable CONFIG_COMPAT_32BIT_TIME and therefore don't provide the 32-bit system calls anymore. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-5-c662992f75d7@weissschuh.net --- tools/include/nolibc/poll.h | 10 +++++----- tools/include/nolibc/sys/select.h | 10 +++++----- tools/include/nolibc/sys/timerfd.h | 12 ++++++------ tools/include/nolibc/time.h | 36 ++++++++++++++++++------------------ 4 files changed, 34 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index 5b4fa339fbb5..e854c94647b1 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -23,22 +23,22 @@ static __attribute__((unused)) int sys_poll(struct pollfd *fds, int nfds, int timeout) { -#if defined(__NR_ppoll) - struct __kernel_old_timespec t; +#if defined(__NR_ppoll_time64) + struct __kernel_timespec t; if (timeout >= 0) { t.tv_sec = timeout / 1000; t.tv_nsec = (timeout % 1000) * 1000000; } - return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); + return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); #else - struct __kernel_timespec t; + struct __kernel_old_timespec t; if (timeout >= 0) { t.tv_sec = timeout / 1000; t.tv_nsec = (timeout % 1000) * 1000000; } - return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); + return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); #endif } diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h index 50b77dace7ef..f8870ad49687 100644 --- a/tools/include/nolibc/sys/select.h +++ b/tools/include/nolibc/sys/select.h @@ -63,22 +63,22 @@ typedef struct { static __attribute__((unused)) int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) { -#if defined(__NR_pselect6) - struct __kernel_old_timespec t; +#if defined(__NR_pselect6_time64) + struct __kernel_timespec t; if (timeout) { t.tv_sec = timeout->tv_sec; t.tv_nsec = timeout->tv_usec * 1000; } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); + return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #else - struct __kernel_timespec t; + struct __kernel_old_timespec t; if (timeout) { t.tv_sec = timeout->tv_sec; t.tv_nsec = timeout->tv_usec * 1000; } - return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #endif } diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 5dd61030c991..66f779553d31 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -32,9 +32,7 @@ int timerfd_create(int clockid, int flags) static __attribute__((unused)) int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { -#if defined(__NR_timerfd_gettime) - return my_syscall2(__NR_timerfd_gettime, fd, curr_value); -#else +#if defined(__NR_timerfd_gettime64) struct __kernel_itimerspec kcurr_value; int ret; @@ -42,6 +40,8 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); return ret; +#else + return my_syscall2(__NR_timerfd_gettime, fd, curr_value); #endif } @@ -56,9 +56,7 @@ static __attribute__((unused)) int sys_timerfd_settime(int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { -#if defined(__NR_timerfd_settime) - return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); -#else +#if defined(__NR_timerfd_settime64) struct __kernel_itimerspec knew_value, kold_value; int ret; @@ -70,6 +68,8 @@ int sys_timerfd_settime(int fd, int flags, __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); } return ret; +#else + return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index 48e78f8becf9..45df9b09d7b6 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -43,9 +43,7 @@ void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struc static __attribute__((unused)) int sys_clock_getres(clockid_t clockid, struct timespec *res) { -#if defined(__NR_clock_getres) - return my_syscall2(__NR_clock_getres, clockid, res); -#else +#if defined(__NR_clock_getres_time64) struct __kernel_timespec kres; int ret; @@ -53,6 +51,8 @@ int sys_clock_getres(clockid_t clockid, struct timespec *res) if (res) __nolibc_timespec_kernel_to_user(&kres, res); return ret; +#else + return my_syscall2(__NR_clock_getres, clockid, res); #endif } @@ -65,9 +65,7 @@ int clock_getres(clockid_t clockid, struct timespec *res) static __attribute__((unused)) int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { -#if defined(__NR_clock_gettime) - return my_syscall2(__NR_clock_gettime, clockid, tp); -#else +#if defined(__NR_clock_gettime64) struct __kernel_timespec ktp; int ret; @@ -75,6 +73,8 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp) if (tp) __nolibc_timespec_kernel_to_user(&ktp, tp); return ret; +#else + return my_syscall2(__NR_clock_gettime, clockid, tp); #endif } @@ -87,13 +87,13 @@ int clock_gettime(clockid_t clockid, struct timespec *tp) static __attribute__((unused)) int sys_clock_settime(clockid_t clockid, struct timespec *tp) { -#if defined(__NR_clock_settime) - return my_syscall2(__NR_clock_settime, clockid, tp); -#else +#if defined(__NR_clock_settime64) struct __kernel_timespec ktp; __nolibc_timespec_user_to_kernel(tp, &ktp); return my_syscall2(__NR_clock_settime64, clockid, &ktp); +#else + return my_syscall2(__NR_clock_settime, clockid, tp); #endif } @@ -107,9 +107,7 @@ static __attribute__((unused)) int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqtp, struct timespec *rmtp) { -#if defined(__NR_clock_nanosleep) - return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); -#else +#if defined(__NR_clock_nanosleep_time64) struct __kernel_timespec krqtp, krmtp; int ret; @@ -118,6 +116,8 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt if (rmtp) __nolibc_timespec_kernel_to_user(&krmtp, rmtp); return ret; +#else + return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); #endif } @@ -189,9 +189,7 @@ int timer_delete(timer_t timerid) static __attribute__((unused)) int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) { -#if defined(__NR_timer_gettime) - return my_syscall2(__NR_timer_gettime, timerid, curr_value); -#else +#if defined(__NR_timer_gettime64) struct __kernel_itimerspec kcurr_value; int ret; @@ -199,6 +197,8 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); return ret; +#else + return my_syscall2(__NR_timer_gettime, timerid, curr_value); #endif } @@ -212,9 +212,7 @@ static __attribute__((unused)) int sys_timer_settime(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { -#if defined(__NR_timer_settime) - return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); -#else +#if defined(__NR_timer_settime64) struct __kernel_itimerspec knew_value, kold_value; int ret; @@ -226,6 +224,8 @@ int sys_timer_settime(timer_t timerid, int flags, __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); } return ret; +#else + return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); #endif } -- cgit v1.2.3 From 7efd15d22a9b7e62d0659471bde25c35ef50c9e5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:50 +0100 Subject: tools/nolibc/gettimeofday: avoid libgcc 64-bit divisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit timespec::tv_nsec is going to be 64-bit wide even on 32-bit architectures. As not all architectures support 64-bit division instructions, calls to libgcc (__divdi3()) may be emitted by the compiler which are not provided by nolibc. As tv_nsec is guaranteed to always fit into an uint32_t, perform a 32-bit division instead. Signed-off-by: Thomas Weißschuh Reviewed-by: Arnd Bergmann Acked-by: Willy Tarreau Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-6-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/time.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys/time.h b/tools/include/nolibc/sys/time.h index 171187836e6d..afdb7e326df1 100644 --- a/tools/include/nolibc/sys/time.h +++ b/tools/include/nolibc/sys/time.h @@ -30,7 +30,7 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz) ret = sys_clock_gettime(CLOCK_REALTIME, &tp); if (!ret && tv) { tv->tv_sec = tp.tv_sec; - tv->tv_usec = tp.tv_nsec / 1000; + tv->tv_usec = (uint32_t)tp.tv_nsec / 1000; } return ret; -- cgit v1.2.3 From 47c17d97681d9c5d080acfdb273fa0856c930e74 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:51 +0100 Subject: tools/nolibc/select: avoid libgcc 64-bit multiplications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit timeval::tv_usec is going to be 64-bit wide even on 32-bit architectures. As not all architectures support 64-bit multiplications instructions, calls to libgcc (__multi3()) may be emitted by the compiler which are not provided by nolibc. As tv_usec and tv_nsec are guaranteed to always fit into an uint32_t, perform a 32-bit multiplication instead. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-7-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/select.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h index f8870ad49687..80cb3755ba18 100644 --- a/tools/include/nolibc/sys/select.h +++ b/tools/include/nolibc/sys/select.h @@ -68,7 +68,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva if (timeout) { t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; + t.tv_nsec = (uint32_t)timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #else @@ -76,7 +76,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva if (timeout) { t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; + t.tv_nsec = (uint32_t)timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #endif -- cgit v1.2.3 From f5aa863aea6c1ec20b85cc0b0a22e99597f0cb50 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:52 +0100 Subject: tools/nolibc: use custom structs timespec and timeval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A custom 'struct timespec' and 'struct timeval' will be necessary for 64-bit time types on 32-bit architectures. will define other time-related types in terms of the custom 'struct timespec'. Add custom struct definitions which for now mirror exactly the ones from the UAPI headers, but provide the foundation for further changes. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-8-c662992f75d7@weissschuh.net --- tools/include/nolibc/arch-s390.h | 3 +++ tools/include/nolibc/types.h | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 74125a254ce3..5bee6ecbde0a 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -5,6 +5,9 @@ #ifndef _NOLIBC_ARCH_S390_H #define _NOLIBC_ARCH_S390_H + +#include "types.h" + #include #include diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 470a5f77bc0f..c8ed4d9cae8a 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -13,9 +13,23 @@ #include "std.h" #include #include -#include +#include #include +struct timespec { + __kernel_time_t tv_sec; + long tv_nsec; +}; +#define _STRUCT_TIMESPEC + +struct timeval { + __kernel_time_t tv_sec; + __kernel_suseconds_t tv_usec; +}; + +#define timeval __nolibc_kernel_timeval +#include +#undef timeval /* Only the generic macros and types may be defined here. The arch-specific * ones such as the O_RDONLY and related macros used by fcntl() and open() -- cgit v1.2.3 From bdcfc417f26ffd1a7e214d1cce78500dc4dbc2d5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:53 +0100 Subject: tools/nolibc: always use 64-bit time types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 32-bit time types will stop working in 2038. Switch to 64-bit time types everywhere. Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/lkml/cec27d94-c99d-4c57-9a12-275ea663dda8@app.fastmail.com/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-9-c662992f75d7@weissschuh.net --- tools/include/nolibc/std.h | 2 +- tools/include/nolibc/types.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index 392f4dd94158..b9a116123902 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -29,6 +29,6 @@ typedef unsigned long nlink_t; typedef int64_t off_t; typedef signed long blksize_t; typedef signed long blkcnt_t; -typedef __kernel_time_t time_t; +typedef __kernel_time64_t time_t; #endif /* _NOLIBC_STD_H */ diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index c8ed4d9cae8a..8f3cb18df7f1 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -17,14 +17,15 @@ #include struct timespec { - __kernel_time_t tv_sec; - long tv_nsec; + time_t tv_sec; + int64_t tv_nsec; }; #define _STRUCT_TIMESPEC +/* Never use with system calls */ struct timeval { - __kernel_time_t tv_sec; - __kernel_suseconds_t tv_usec; + time_t tv_sec; + int64_t tv_usec; }; #define timeval __nolibc_kernel_timeval -- cgit v1.2.3 From a590a79d19046d3e0f2089f83046f3b87f880359 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 1 Jan 2026 11:34:16 -0500 Subject: rcutorture: Prevent concurrent kvm.sh runs on same source tree Add flock-based locking to kvm.sh to prevent multiple instances from running concurrently on the same source tree. This prevents build failures caused by one instance's "make clean" deleting generated files while another instance is building causing build failures. The lock file is placed in the rcutorture directory and added to .gitignore. Signed-off-by: Joel Fernandes Tested-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/.gitignore | 1 + tools/testing/selftests/rcutorture/bin/kvm.sh | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index f6cbce77460b..b8fd42547a6e 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -3,3 +3,4 @@ initrd b[0-9]* res *.swp +.kvm.sh.lock diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fff15821c44c..d1fbd092e22a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -275,6 +275,23 @@ do shift done +# Prevent concurrent kvm.sh runs on the same source tree. The flock +# is automatically released when the script exits, even if killed. +TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock" +if test -z "$dryrun" +then + # Create a file descriptor and flock it, so that when kvm.sh (and its + # children) exit, the flock is released by the kernel automatically. + exec 9>"$TORTURE_LOCK" + if ! flock -n 9 + then + echo "ERROR: Another kvm.sh instance is already running on this tree." + echo " Lock file: $TORTURE_LOCK" + echo " To run kvm.sh, kill all existing kvm.sh runs first." + exit 1 + fi +fi + if test -n "$dryrun" || test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh then : -- cgit v1.2.3 From cf587c6ff2d09866eb53a4620bc1aa561fb0c000 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 1 Jan 2026 11:34:17 -0500 Subject: rcutorture: Add --kill-previous option to terminate previous kvm.sh runs When kvm.sh is killed, its child processes (make, gcc, qemu, etc.) may continue running. This prevents new kvm.sh instances from starting even though the parent is gone. Add a --kill-previous option that uses fuser(1) to terminate all processes holding the flock file before attempting to acquire it. This provides a clean way to recover from stale/zombie kvm.sh runs which sometimes may have lots of qemu and compiler processes still disturbing. Signed-off-by: Joel Fernandes Tested-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm.sh | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index d1fbd092e22a..65b04b832733 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -80,6 +80,7 @@ usage () { echo " --kasan" echo " --kconfig Kconfig-options" echo " --kcsan" + echo " --kill-previous" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --memory megabytes|nnnG" @@ -206,6 +207,9 @@ do --kcsan) TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; + --kill-previous) + TORTURE_KILL_PREVIOUS=1 + ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="`echo "$TORTURE_KMAKE_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -278,6 +282,25 @@ done # Prevent concurrent kvm.sh runs on the same source tree. The flock # is automatically released when the script exits, even if killed. TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock" + +# Terminate any processes holding the lock file, if requested. +if test -n "$TORTURE_KILL_PREVIOUS" +then + if test -e "$TORTURE_LOCK" + then + echo "Killing processes holding $TORTURE_LOCK..." + if fuser -k "$TORTURE_LOCK" >/dev/null 2>&1 + then + sleep 2 + echo "Previous kvm.sh processes killed." + else + echo "No processes were holding the lock." + fi + else + echo "No lock file exists, nothing to kill." + fi +fi + if test -z "$dryrun" then # Create a file descriptor and flock it, so that when kvm.sh (and its @@ -287,7 +310,7 @@ then then echo "ERROR: Another kvm.sh instance is already running on this tree." echo " Lock file: $TORTURE_LOCK" - echo " To run kvm.sh, kill all existing kvm.sh runs first." + echo " To run kvm.sh, kill all existing kvm.sh runs first (--kill-previous)." exit 1 fi fi -- cgit v1.2.3 From c18f35e4904920db4c51620ba634e4d175b24741 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 23 Dec 2025 20:35:38 +0900 Subject: objtool/rust: add one more `noreturn` Rust function Fix the following warning: rust/kernel.o: warning: objtool: _RNvXNtNtCs1ewLyjEZ7Le_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix14from_str_radix() falls through to next function _RNvXNtNtCs1ewLyjEZ7Le_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix16from_u64_negated() The commit 51d9ee90ea90 ("rust: str: add radix prefixed integer parsing functions") introduces u64::from_str_radix(), whose implementation contains a panic path for out-of-range radix values. The panic helper is core::num::from_ascii_radix_panic(). Note that radix is derived from strip_radix() here and is always within the valid range, so kernel never panics. Fixes: 51d9ee90ea90 ("rust: str: add radix prefixed integer parsing functions") Signed-off-by: FUJITA Tomonori Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Link: https://patch.msgid.link/20251223113538.1016078-1-fujita.tomonori@gmail.com [ Reworded typo. - Miguel ] Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f7999317f4d..719ec727efd4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -197,7 +197,8 @@ static bool is_rust_noreturn(const struct symbol *func) * as well as changes to the source code itself between versions (since * these come from the Rust standard library). */ - return str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || + return str_ends_with(func->name, "_4core3num22from_ascii_radix_panic") || + str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || str_ends_with(func->name, "_4core6option13expect_failed") || str_ends_with(func->name, "_4core6option13unwrap_failed") || str_ends_with(func->name, "_4core6result13unwrap_failed") || -- cgit v1.2.3 From be05f571464404432a0f8fe1c81a86a0862da283 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 28 Dec 2025 20:39:42 +0200 Subject: memblock test: include from tools mm.h stub MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memblock test compilation fails: memblock.c: In function ‘memblock_validate_numa_coverage’: memblock.c:784:58: error: ‘SZ_1M’ undeclared (first use in this function) 784 | mem_size_mb = memblock_phys_mem_size() / SZ_1M; | ^~~~~ The SZ_1M is defined in sizes.h, but it is not included by stub version of mm.h in tools/include/linux. Add include of sizes.h to tools/include/linux/mm.h to fix the compilation of memblock tests. Link: https://patch.msgid.link/20251228183942.3628918-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Anshuman Khandual --- tools/include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index 677c37e4a18c..028f3faf46e7 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -4,6 +4,7 @@ #include #include +#include #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -- cgit v1.2.3 From e4588c25c9d122b5847b88e18b184404b6959160 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 19 Dec 2025 16:40:13 +0100 Subject: compiler-context-analysis: Remove __cond_lock() function-like helper As discussed in [1], removing __cond_lock() will improve the readability of trylock code. Now that Sparse context tracking support has been removed, we can also remove __cond_lock(). Change existing APIs to either drop __cond_lock() completely, or make use of the __cond_acquires() function attribute instead. In particular, spinlock and rwlock implementations required switching over to inline helpers rather than statement-expressions for their trylock_* variants. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250207082832.GU7145@noisy.programming.kicks-ass.net/ [1] Link: https://patch.msgid.link/20251219154418.3592607-25-elver@google.com --- Documentation/dev-tools/context-analysis.rst | 2 - Documentation/mm/process_addrs.rst | 6 +-- drivers/net/wireless/intel/iwlwifi/iwl-trans.c | 4 +- drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 6 +-- .../wireless/intel/iwlwifi/pcie/gen1_2/internal.h | 5 +- .../net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c | 4 +- include/linux/compiler-context-analysis.h | 31 ----------- include/linux/lockref.h | 4 +- include/linux/mm.h | 33 ++---------- include/linux/rwlock.h | 11 ++-- include/linux/rwlock_api_smp.h | 14 ++++- include/linux/rwlock_rt.h | 21 ++++---- include/linux/sched/signal.h | 14 +---- include/linux/spinlock.h | 45 ++++++---------- include/linux/spinlock_api_smp.h | 20 +++++++ include/linux/spinlock_api_up.h | 61 ++++++++++++++++++---- include/linux/spinlock_rt.h | 26 +++++---- kernel/signal.c | 4 +- kernel/time/posix-timers.c | 13 ++--- lib/dec_and_lock.c | 8 +-- lib/lockref.c | 1 - mm/memory.c | 4 +- mm/pgtable-generic.c | 19 ++++--- tools/include/linux/compiler_types.h | 2 - 24 files changed, 163 insertions(+), 195 deletions(-) (limited to 'tools') diff --git a/Documentation/dev-tools/context-analysis.rst b/Documentation/dev-tools/context-analysis.rst index 8dd6c0d695aa..e69896e597b6 100644 --- a/Documentation/dev-tools/context-analysis.rst +++ b/Documentation/dev-tools/context-analysis.rst @@ -112,10 +112,8 @@ Keywords __releases_shared __acquire __release - __cond_lock __acquire_shared __release_shared - __cond_lock_shared __acquire_ret __acquire_shared_ret context_unsafe diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 7f2f3e87071d..851680ead45f 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -583,7 +583,7 @@ To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or :c:func:`!pte_offset_map` can be used depending on stability requirements. These map the page table into kernel memory if required, take the RCU lock, and depending on variant, may also look up or acquire the PTE lock. -See the comment on :c:func:`!__pte_offset_map_lock`. +See the comment on :c:func:`!pte_offset_map_lock`. Atomicity ^^^^^^^^^ @@ -667,7 +667,7 @@ must be released via :c:func:`!pte_unmap_unlock`. .. note:: There are some variants on this, such as :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but for brevity we do not explore this. See the comment for - :c:func:`!__pte_offset_map_lock` for more details. + :c:func:`!pte_offset_map_lock` for more details. When modifying data in ranges we typically only wish to allocate higher page tables as necessary, using these locks to avoid races or overwriting anything, @@ -686,7 +686,7 @@ At the leaf page table, that is the PTE, we can't entirely rely on this pattern as we have separate PMD and PTE locks and a THP collapse for instance might have eliminated the PMD entry as well as the PTE from under us. -This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry +This is why :c:func:`!pte_offset_map_lock` locklessly retrieves the PMD entry for the PTE, carefully checking it is as expected, before acquiring the PTE-specific lock, and then *again* checking that the PMD entry is as expected. diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index cc8a84018f70..fa1442246662 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -548,11 +548,11 @@ int iwl_trans_read_config32(struct iwl_trans *trans, u32 ofs, return iwl_trans_pcie_read_config32(trans, ofs, val); } -bool _iwl_trans_grab_nic_access(struct iwl_trans *trans) +bool iwl_trans_grab_nic_access(struct iwl_trans *trans) { return iwl_trans_pcie_grab_nic_access(trans); } -IWL_EXPORT_SYMBOL(_iwl_trans_grab_nic_access); +IWL_EXPORT_SYMBOL(iwl_trans_grab_nic_access); void __releases(nic_access) iwl_trans_release_nic_access(struct iwl_trans *trans) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index a552669db6e2..688f9fee2821 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -1063,11 +1063,7 @@ int iwl_trans_sw_reset(struct iwl_trans *trans); void iwl_trans_set_bits_mask(struct iwl_trans *trans, u32 reg, u32 mask, u32 value); -bool _iwl_trans_grab_nic_access(struct iwl_trans *trans); - -#define iwl_trans_grab_nic_access(trans) \ - __cond_lock(nic_access, \ - likely(_iwl_trans_grab_nic_access(trans))) +bool iwl_trans_grab_nic_access(struct iwl_trans *trans); void __releases(nic_access) iwl_trans_release_nic_access(struct iwl_trans *trans); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h index 207c56e338dd..7b7b35e442f9 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h @@ -553,10 +553,7 @@ void iwl_trans_pcie_free(struct iwl_trans *trans); void iwl_trans_pcie_free_pnvm_dram_regions(struct iwl_dram_regions *dram_regions, struct device *dev); -bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent); -#define _iwl_trans_pcie_grab_nic_access(trans, silent) \ - __cond_lock(nic_access_nobh, \ - likely(__iwl_trans_pcie_grab_nic_access(trans, silent))) +bool _iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent); void iwl_trans_pcie_check_product_reset_status(struct pci_dev *pdev); void iwl_trans_pcie_check_product_reset_mode(struct pci_dev *pdev); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c index 164d060ec617..415a19ea9f06 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c @@ -2327,7 +2327,7 @@ EXPORT_SYMBOL(iwl_trans_pcie_reset); * This version doesn't disable BHs but rather assumes they're * already disabled. */ -bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent) +bool _iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent) { int ret; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); @@ -2415,7 +2415,7 @@ bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans) bool ret; local_bh_disable(); - ret = __iwl_trans_pcie_grab_nic_access(trans, false); + ret = _iwl_trans_pcie_grab_nic_access(trans, false); if (ret) { /* keep BHs disabled until iwl_trans_pcie_release_nic_access */ return ret; diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h index cb728822343f..4f7559d7ae91 100644 --- a/include/linux/compiler-context-analysis.h +++ b/include/linux/compiler-context-analysis.h @@ -341,24 +341,6 @@ static inline void _context_unsafe_alias(void **p) { } */ #define __release(x) __release_ctx_lock(x) -/** - * __cond_lock() - function that conditionally acquires a context lock - * exclusively - * @x: context lock instance pinter - * @c: boolean expression - * - * Return: result of @c - * - * No-op function that conditionally acquires context lock instance @x - * exclusively, if the boolean expression @c is true. The result of @c is the - * return value; for example: - * - * .. code-block:: c - * - * #define spin_trylock(l) __cond_lock(&lock, _spin_trylock(&lock)) - */ -#define __cond_lock(x, c) __try_acquire_ctx_lock(x, c) - /** * __must_hold_shared() - function attribute, caller must hold shared context lock * @@ -417,19 +399,6 @@ static inline void _context_unsafe_alias(void **p) { } */ #define __release_shared(x) __release_shared_ctx_lock(x) -/** - * __cond_lock_shared() - function that conditionally acquires a context lock shared - * @x: context lock instance pinter - * @c: boolean expression - * - * Return: result of @c - * - * No-op function that conditionally acquires context lock instance @x with - * shared access, if the boolean expression @c is true. The result of @c is the - * return value. - */ -#define __cond_lock_shared(x, c) __try_acquire_shared_ctx_lock(x, c) - /** * __acquire_ret() - helper to acquire context lock of return value * @call: call expression diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 815d871fadfc..6ded24cdb4a8 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -49,9 +49,7 @@ static inline void lockref_init(struct lockref *lockref) void lockref_get(struct lockref *lockref); int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); -bool lockref_put_or_lock(struct lockref *lockref); -#define lockref_put_or_lock(_lockref) \ - (!__cond_lock((_lockref)->lock, !lockref_put_or_lock(_lockref))) +bool lockref_put_or_lock(struct lockref *lockref) __cond_acquires(false, &lockref->lock); void lockref_mark_dead(struct lockref *lockref); bool lockref_get_not_dead(struct lockref *lockref); diff --git a/include/linux/mm.h b/include/linux/mm.h index 15076261d0c2..f369cb633516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2975,15 +2975,8 @@ static inline pud_t pud_mkspecial(pud_t pud) } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ -extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl); -static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl) -{ - pte_t *ptep; - __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); - return ptep; -} +extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl); #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, @@ -3337,31 +3330,15 @@ static inline bool pagetable_pte_ctor(struct mm_struct *mm, return true; } -pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); -static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, - pmd_t *pmdvalp) -{ - pte_t *pte; +pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); - __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); - return pte; -} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } -pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp); -static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp) -{ - pte_t *pte; - - __cond_lock(RCU, __cond_lock(*ptlp, - pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); - return pte; -} +pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 151f9d5f3288..65a5b55e1bcd 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -50,8 +50,8 @@ do { \ * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various * methods are defined as nops in the case they are not required. */ -#define read_trylock(lock) __cond_lock_shared(lock, _raw_read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _raw_write_trylock(lock)) +#define read_trylock(lock) _raw_read_trylock(lock) +#define write_trylock(lock) _raw_write_trylock(lock) #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) @@ -113,12 +113,7 @@ do { \ } while (0) #define write_unlock_bh(lock) _raw_write_unlock_bh(lock) -#define write_trylock_irqsave(lock, flags) \ - __cond_lock(lock, ({ \ - local_irq_save(flags); \ - _raw_write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ - })) +#define write_trylock_irqsave(lock, flags) _raw_write_trylock_irqsave(lock, &(flags)) #ifdef arch_rwlock_is_contended #define rwlock_is_contended(lock) \ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 6d5cc0b7be1f..d903b17c46ca 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -26,8 +26,8 @@ unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) __acquires(lock); -int __lockfunc _raw_read_trylock(rwlock_t *lock); -int __lockfunc _raw_write_trylock(rwlock_t *lock); +int __lockfunc _raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); +int __lockfunc _raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); void __lockfunc _raw_read_unlock(rwlock_t *lock) __releases_shared(lock); void __lockfunc _raw_write_unlock(rwlock_t *lock) __releases(lock); void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) __releases_shared(lock); @@ -41,6 +41,16 @@ void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); +static inline bool _raw_write_trylock_irqsave(rwlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) +{ + local_irq_save(*flags); + if (_raw_write_trylock(lock)) + return true; + local_irq_restore(*flags); + return false; +} + #ifdef CONFIG_INLINE_READ_LOCK #define _raw_read_lock(lock) __raw_read_lock(lock) #endif diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index f64d6d319a47..37b387dcab21 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -26,11 +26,11 @@ do { \ } while (0) extern void rt_read_lock(rwlock_t *rwlock) __acquires_shared(rwlock); -extern int rt_read_trylock(rwlock_t *rwlock); +extern int rt_read_trylock(rwlock_t *rwlock) __cond_acquires_shared(true, rwlock); extern void rt_read_unlock(rwlock_t *rwlock) __releases_shared(rwlock); extern void rt_write_lock(rwlock_t *rwlock) __acquires(rwlock); extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(rwlock); -extern int rt_write_trylock(rwlock_t *rwlock); +extern int rt_write_trylock(rwlock_t *rwlock) __cond_acquires(true, rwlock); extern void rt_write_unlock(rwlock_t *rwlock) __releases(rwlock); static __always_inline void read_lock(rwlock_t *rwlock) @@ -59,7 +59,7 @@ static __always_inline void read_lock_irq(rwlock_t *rwlock) flags = 0; \ } while (0) -#define read_trylock(lock) __cond_lock_shared(lock, rt_read_trylock(lock)) +#define read_trylock(lock) rt_read_trylock(lock) static __always_inline void read_unlock(rwlock_t *rwlock) __releases_shared(rwlock) @@ -123,14 +123,15 @@ static __always_inline void write_lock_irq(rwlock_t *rwlock) flags = 0; \ } while (0) -#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) +#define write_trylock(lock) rt_write_trylock(lock) -#define write_trylock_irqsave(lock, flags) \ - __cond_lock(lock, ({ \ - typecheck(unsigned long, flags); \ - flags = 0; \ - rt_write_trylock(lock); \ - })) +static __always_inline bool _write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) + __cond_acquires(true, rwlock) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} +#define write_trylock_irqsave(lock, flags) _write_trylock_irqsave(lock, &(flags)) static __always_inline void write_unlock(rwlock_t *rwlock) __releases(rwlock) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 7d6449982822..a63f65aa5bdd 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -737,18 +737,8 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, - unsigned long *flags); - -static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(task, flags); - (void)__cond_lock(&task->sighand->siglock, ret); - return ret; -} +extern struct sighand_struct *lock_task_sighand(struct task_struct *task, + unsigned long *flags); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7e560c7a7b23..396b8c5d6c1b 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -213,7 +213,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) * various methods are defined as nops in the case they are not * required. */ -#define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) +#define raw_spin_trylock(lock) _raw_spin_trylock(lock) #define raw_spin_lock(lock) _raw_spin_lock(lock) @@ -284,22 +284,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) -#define raw_spin_trylock_bh(lock) \ - __cond_lock(lock, _raw_spin_trylock_bh(lock)) +#define raw_spin_trylock_bh(lock) _raw_spin_trylock_bh(lock) -#define raw_spin_trylock_irq(lock) \ - __cond_lock(lock, ({ \ - local_irq_disable(); \ - _raw_spin_trylock(lock) ? \ - 1 : ({ local_irq_enable(); 0; }); \ - })) +#define raw_spin_trylock_irq(lock) _raw_spin_trylock_irq(lock) -#define raw_spin_trylock_irqsave(lock, flags) \ - __cond_lock(lock, ({ \ - local_irq_save(flags); \ - _raw_spin_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ - })) +#define raw_spin_trylock_irqsave(lock, flags) _raw_spin_trylock_irqsave(lock, &(flags)) #ifndef CONFIG_PREEMPT_RT /* Include rwlock functions for !RT */ @@ -433,8 +422,12 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) return raw_spin_trylock_irq(&lock->rlock); } -#define spin_trylock_irqsave(lock, flags) \ - __cond_lock(lock, raw_spin_trylock_irqsave(spinlock_check(lock), flags)) +static __always_inline bool _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) __no_context_analysis +{ + return raw_spin_trylock_irqsave(spinlock_check(lock), *flags); +} +#define spin_trylock_irqsave(lock, flags) _spin_trylock_irqsave(lock, &(flags)) /** * spin_is_locked() - Check whether a spinlock is locked. @@ -512,23 +505,17 @@ static inline int rwlock_needbreak(rwlock_t *lock) * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); -#define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) __cond_acquires(true, lock); extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, - unsigned long *flags); -#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ - __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) + unsigned long *flags) __cond_acquires(true, lock); +#define atomic_dec_and_lock_irqsave(atomic, lock, flags) _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)) -extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); -#define atomic_dec_and_raw_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) +extern int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) __cond_acquires(true, lock); extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, - unsigned long *flags); -#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ - __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) + unsigned long *flags) __cond_acquires(true, lock); +#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 7e7d7d373213..bda5e7a390cd 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -95,6 +95,26 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) return 0; } +static __always_inline bool _raw_spin_trylock_irq(raw_spinlock_t *lock) + __cond_acquires(true, lock) +{ + local_irq_disable(); + if (_raw_spin_trylock(lock)) + return true; + local_irq_enable(); + return false; +} + +static __always_inline bool _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) +{ + local_irq_save(*flags); + if (_raw_spin_trylock(lock)) + return true; + local_irq_restore(*flags); + return false; +} + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index 018f5aabc1be..a9d5c7c66e03 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -24,14 +24,11 @@ * flags straight, to suppress compiler warnings of unused lock * variables, and to add the proper checker annotations: */ -#define ___LOCK_void(lock) \ - do { (void)(lock); } while (0) - #define ___LOCK_(lock) \ - do { __acquire(lock); ___LOCK_void(lock); } while (0) + do { __acquire(lock); (void)(lock); } while (0) #define ___LOCK_shared(lock) \ - do { __acquire_shared(lock); ___LOCK_void(lock); } while (0) + do { __acquire_shared(lock); (void)(lock); } while (0) #define __LOCK(lock, ...) \ do { preempt_disable(); ___LOCK_##__VA_ARGS__(lock); } while (0) @@ -78,10 +75,56 @@ #define _raw_spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) #define _raw_read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags, shared) #define _raw_write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _raw_spin_trylock(lock) ({ __LOCK(lock, void); 1; }) -#define _raw_read_trylock(lock) ({ __LOCK(lock, void); 1; }) -#define _raw_write_trylock(lock) ({ __LOCK(lock, void); 1; }) -#define _raw_spin_trylock_bh(lock) ({ __LOCK_BH(lock, void); 1; }) + +static __always_inline int _raw_spin_trylock(raw_spinlock_t *lock) + __cond_acquires(true, lock) +{ + __LOCK(lock); + return 1; +} + +static __always_inline int _raw_spin_trylock_bh(raw_spinlock_t *lock) + __cond_acquires(true, lock) +{ + __LOCK_BH(lock); + return 1; +} + +static __always_inline int _raw_spin_trylock_irq(raw_spinlock_t *lock) + __cond_acquires(true, lock) +{ + __LOCK_IRQ(lock); + return 1; +} + +static __always_inline int _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) +{ + __LOCK_IRQSAVE(lock, *(flags)); + return 1; +} + +static __always_inline int _raw_read_trylock(rwlock_t *lock) + __cond_acquires_shared(true, lock) +{ + __LOCK(lock, shared); + return 1; +} + +static __always_inline int _raw_write_trylock(rwlock_t *lock) + __cond_acquires(true, lock) +{ + __LOCK(lock); + return 1; +} + +static __always_inline int _raw_write_trylock_irqsave(rwlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) +{ + __LOCK_IRQSAVE(lock, *(flags)); + return 1; +} + #define _raw_spin_unlock(lock) __UNLOCK(lock) #define _raw_read_unlock(lock) __UNLOCK(lock, shared) #define _raw_write_unlock(lock) __UNLOCK(lock) diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 6bab73ee1384..0a585768358f 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -37,8 +37,8 @@ extern void rt_spin_lock_nested(spinlock_t *lock, int subclass) __acquires(lock) extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock); extern void rt_spin_unlock(spinlock_t *lock) __releases(lock); extern void rt_spin_lock_unlock(spinlock_t *lock); -extern int rt_spin_trylock_bh(spinlock_t *lock); -extern int rt_spin_trylock(spinlock_t *lock); +extern int rt_spin_trylock_bh(spinlock_t *lock) __cond_acquires(true, lock); +extern int rt_spin_trylock(spinlock_t *lock) __cond_acquires(true, lock); static __always_inline void spin_lock(spinlock_t *lock) __acquires(lock) @@ -130,21 +130,19 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, rt_spin_unlock(lock); } -#define spin_trylock(lock) \ - __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_trylock(lock) rt_spin_trylock(lock) -#define spin_trylock_bh(lock) \ - __cond_lock(lock, rt_spin_trylock_bh(lock)) +#define spin_trylock_bh(lock) rt_spin_trylock_bh(lock) -#define spin_trylock_irq(lock) \ - __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_trylock_irq(lock) rt_spin_trylock(lock) -#define spin_trylock_irqsave(lock, flags) \ - __cond_lock(lock, ({ \ - typecheck(unsigned long, flags); \ - flags = 0; \ - rt_spin_trylock(lock); \ - })) +static __always_inline bool _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) + __cond_acquires(true, lock) +{ + *flags = 0; + return rt_spin_trylock(lock); +} +#define spin_trylock_irqsave(lock, flags) _spin_trylock_irqsave(lock, &(flags)) #define spin_is_contended(lock) (((void)(lock), 0)) diff --git a/kernel/signal.c b/kernel/signal.c index e42b8bd6922f..d65d0fe24bfb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1355,8 +1355,8 @@ int zap_other_threads(struct task_struct *p) return count; } -struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) +struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) { struct sighand_struct *sighand; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 80a8a09a21a0..413e2389f0a5 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -66,14 +66,7 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -static struct k_itimer *__lock_timer(timer_t timer_id); - -#define lock_timer(tid) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ - __timr; \ -}) - +static struct k_itimer *lock_timer(timer_t timer_id); static inline void unlock_timer(struct k_itimer *timr) { if (likely((timr))) @@ -85,7 +78,7 @@ static inline void unlock_timer(struct k_itimer *timr) #define scoped_timer (scope) -DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id); DEFINE_CLASS_IS_COND_GUARD(lock_timer); static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) @@ -600,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id) +static struct k_itimer *lock_timer(timer_t timer_id) { struct k_itimer *timr; diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 1dcca8f2e194..8c7c398fd770 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -18,7 +18,7 @@ * because the spin-lock and the decrement must be * "atomic". */ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) @@ -32,7 +32,7 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) @@ -50,7 +50,7 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); -int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) +int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) @@ -63,7 +63,7 @@ int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) raw_spin_unlock(lock); return 0; } -EXPORT_SYMBOL(_atomic_dec_and_raw_lock); +EXPORT_SYMBOL(atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) diff --git a/lib/lockref.c b/lib/lockref.c index 9210fc6ae714..5d8e3ef3860e 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(lockref_put_return); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ -#undef lockref_put_or_lock bool lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..b751e1f85abc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2210,8 +2210,8 @@ static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl) +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d3aec7a9926a..af7966169d69 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -280,7 +280,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; @@ -332,13 +332,12 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, } /* - * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation - * __pte_offset_map_lock() below, is usually called with the pmd pointer for - * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while - * holding mmap_lock or vma lock for read or for write; or in truncate or rmap - * context, while holding file's i_mmap_lock or anon_vma lock for read (or for - * write). In a few cases, it may be used with pmd pointing to a pmd_t already - * copied to or constructed on the stack. + * pte_offset_map_lock(mm, pmd, addr, ptlp) is usually called with the pmd + * pointer for addr, reached by walking down the mm's pgd, p4d, pud for addr: + * either while holding mmap_lock or vma lock for read or for write; or in + * truncate or rmap context, while holding file's i_mmap_lock or anon_vma lock + * for read (or for write). In a few cases, it may be used with pmd pointing to + * a pmd_t already copied to or constructed on the stack. * * When successful, it returns the pte pointer for addr, with its page table * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent @@ -389,8 +388,8 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, * table, and may not use RCU at all: "outsiders" like khugepaged should avoid * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. */ -pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp) +pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) { spinlock_t *ptl; pmd_t pmdval; diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index d09f9dc172a4..067a5b4e0f7b 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -20,7 +20,6 @@ # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) -# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) #else /* __CHECKER__ */ /* context/locking */ # define __must_hold(x) @@ -28,7 +27,6 @@ # define __releases(x) # define __acquire(x) (void)0 # define __release(x) (void)0 -# define __cond_lock(x,c) (c) #endif /* __CHECKER__ */ /* Compiler specific macros. */ -- cgit v1.2.3 From 623ba6ea45979fb1d06c5c8f03417ecc3565a851 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Mon, 5 Jan 2026 15:00:57 +0000 Subject: perf symbol: Remove Rust symbol workarounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to an off-by-one error introduced in commit 73bbb94466fd3f8b ("kallsyms: support "big" kernel symbols"), long symbols (which are currently only produced by Rust) can have their symbol type being wrongly parsed by kernel/kallsyms.c. This has been fixed in commit f3f9f42232dee596 ("kallsyms: Fix wrong "big" kernel symbol type read from procfs"), and these symbols are now reported correctly. Drop the workaround in perf symbol that filter out these symbol types. Specifically, '1' and 'l' can never be generated by nm -- 'u' does indicate GNU unique, however such symbols are only generated by G++ for C++ templates, and are never generated by LLVM (LLVM generates weak symbols in such cases instead). 'N' can appear if symbols exist inside debug sections, and 'n' may appear for symbols inside note sections, however these sections do not typically have symbol (and they're explicitly filtered out by kallsyms). Therefore, the previous occurrence of these symbols types must be due to the off-by-one error and can be safely removed. Signed-off-by: Gary Guo Acked-by: Miguel Ojeda Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alice Ryhl Cc: Andi Kleen Cc: Andreas Hindborg Cc: Benno Lossin Cc: Bill Wendling Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Dmitriy Vyukov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Justin Stitt Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Trevor Gross Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 814f960fa8f8..8662001e1e25 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -104,21 +104,10 @@ static enum dso_binary_type binary_type_symtab[] = { #define DSO_BINARY_TYPE__SYMTAB_CNT ARRAY_SIZE(binary_type_symtab) -static bool symbol_type__filter(char __symbol_type) -{ - // Since 'U' == undefined and 'u' == unique global symbol, we can't use toupper there - // 'N' is for debugging symbols, 'n' is a non-data, non-code, non-debug read-only section. - // According to 'man nm'. - // 'N' first seen in: - // ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ - // a seemingly Rust mangled name - // Ditto for '1': - // root@x1:~# grep ' 1 ' /proc/kallsyms - // ffffffffb098bc00 1 __pfx__RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ - // ffffffffb098bc10 1 _RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ - char symbol_type = toupper(__symbol_type); - return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' || - __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N' || __symbol_type == '1'; +static bool symbol_type__filter(char symbol_type) +{ + symbol_type = toupper(symbol_type); + return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B'; } static int prefix_underscores_count(const char *str) -- cgit v1.2.3 From f2546eba53bbe38c4bb950f78625ccf4b1a2cbc8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:15 -0800 Subject: cxl/mem: Drop @host argument to devm_cxl_add_memdev() In all cases the device that created the 'struct cxl_dev_state' instance is also the device to host the devm cleanup of devm_cxl_add_memdev(). This simplifies the function prototype, and limits a degree of freedom of the API. Cc: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-6-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 3 +-- drivers/cxl/cxlmem.h | 6 ++---- drivers/cxl/mem.c | 9 +++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 92aea95859fb..935a163f1527 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1093,8 +1093,7 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { struct device *dev; int rc; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 012e68acad34..9db31c7993c4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,10 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index d62931526fd4..677996c65272 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -165,17 +165,18 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device - * @host: devres alloc/release context and parent for the memdev * @cxlds: CXL device state to associate with the memdev * * Upon return the device will have had a chance to attach to the * cxl_mem driver, but may fail if the CXL topology is not ready * (hardware CXL link down, or software platform CXL root not attached) + * + * The parent of the resulting device and the devm context for allocations is + * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { - return __devm_cxl_add_memdev(host, cxlds); + return __devm_cxl_add_memdev(cxlds); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..1c6fc5334806 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 176dcde570cd..8a22b7601627 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); -- cgit v1.2.3 From 29317f8dc6ed601ec54575689c2cd55cc470bcce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:16 -0800 Subject: cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation Unlike the cxl_pci class driver that opportunistically enables memory expansion with no other dependent functionality, CXL accelerator drivers have distinct PCIe-only and CXL-enhanced operation states. If CXL is available some additional coherent memory/cache operations can be enabled, otherwise traditional DMA+MMIO over PCIe/CXL.io is a fallback. This constitutes a new mode of operation where the caller of devm_cxl_add_memdev() wants to make a "go/no-go" decision about running in CXL accelerated mode or falling back to PCIe-only operation. Part of that decision making process likely also includes additional CXL-acceleration-specific resource setup. Encapsulate both of those requirements into 'struct cxl_memdev_attach' that provides a ->probe() callback. The probe callback runs in cxl_mem_probe() context, after the port topology is successfully attached for the given memdev. It supports a contract where, upon successful return from devm_cxl_add_memdev(), everything needed for CXL accelerated operation has been enabled. Additionally the presence of @cxlmd->attach indicates that the accelerator driver be detached when CXL operation ends. This conceptually makes a CXL link loss event mirror a PCIe link loss event which results in triggering the ->remove() callback of affected devices+drivers. A driver can re-attach to recover back to PCIe-only operation. Live recovery, i.e. without a ->remove()/->probe() cycle, is left as a future consideration. [ dj: Repalce with updated commit log from Dan ] Cc: Smita Koralahalli Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-7-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 33 +++++++++++++++++++++++++++++---- drivers/cxl/cxlmem.h | 12 ++++++++++-- drivers/cxl/mem.c | 20 ++++++++++++++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 57 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 935a163f1527..af3d0cc65138 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -641,14 +641,24 @@ static void detach_memdev(struct work_struct *work) struct cxl_memdev *cxlmd; cxlmd = container_of(work, typeof(*cxlmd), detach_work); - device_release_driver(&cxlmd->dev); + + /* + * When the creator of @cxlmd sets ->attach it indicates CXL operation + * is required. In that case, @cxlmd detach escalates to parent device + * detach. + */ + if (cxlmd->attach) + device_release_driver(cxlmd->dev.parent); + else + device_release_driver(&cxlmd->dev); put_device(&cxlmd->dev); } static struct lock_class_key cxl_memdev_key; static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, - const struct file_operations *fops) + const struct file_operations *fops, + const struct cxl_memdev_attach *attach) { struct cxl_memdev *cxlmd; struct device *dev; @@ -664,6 +674,8 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, goto err; cxlmd->id = rc; cxlmd->depth = -1; + cxlmd->attach = attach; + cxlmd->endpoint = ERR_PTR(-ENXIO); dev = &cxlmd->dev; device_initialize(dev); @@ -1081,6 +1093,18 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) { int rc; + /* + * If @attach is provided fail if the driver is not attached upon + * return. Note that failure here could be the result of a race to + * teardown the CXL port topology. I.e. cxl_mem_probe() could have + * succeeded and then cxl_mem unbound before the lock is acquired. + */ + guard(device)(&cxlmd->dev); + if (cxlmd->attach && !cxlmd->dev.driver) { + cxl_memdev_unregister(cxlmd); + return ERR_PTR(-ENXIO); + } + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, cxlmd); if (rc) @@ -1093,13 +1117,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { struct device *dev; int rc; struct cxl_memdev *cxlmd __free(put_cxlmd) = - cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + cxl_memdev_alloc(cxlds, &cxl_memdev_fops, attach); if (IS_ERR(cxlmd)) return cxlmd; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 9db31c7993c4..ef202b34e5ea 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,6 +34,10 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -43,6 +47,7 @@ * @cxl_nvb: coordinate removal of @cxl_nvd if present * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem * @endpoint: connection to the CXL port topology for this memory device + * @attach: creator of this memdev depends on CXL link attach to operate * @id: id number of this memdev instance. * @depth: endpoint port depth * @scrub_cycle: current scrub cycle set for this device @@ -59,6 +64,7 @@ struct cxl_memdev { struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_nvdimm *cxl_nvd; struct cxl_port *endpoint; + const struct cxl_memdev_attach *attach; int id; int depth; u8 scrub_cycle; @@ -95,8 +101,10 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 677996c65272..333c366b69e7 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -142,6 +142,12 @@ static int cxl_mem_probe(struct device *dev) return rc; } + if (cxlmd->attach) { + rc = cxlmd->attach->probe(cxlmd); + if (rc) + return rc; + } + rc = devm_cxl_memdev_edac_register(cxlmd); if (rc) dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc); @@ -166,17 +172,23 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device * @cxlds: CXL device state to associate with the memdev + * @attach: Caller depends on CXL topology attachment * * Upon return the device will have had a chance to attach to the - * cxl_mem driver, but may fail if the CXL topology is not ready - * (hardware CXL link down, or software platform CXL root not attached) + * cxl_mem driver, but may fail to attach if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached). + * + * When @attach is NULL it indicates the caller wants the memdev to remain + * registered even if it does not immediately attach to the CXL hierarchy. When + * @attach is provided a cxl_mem_probe() failure leads to failure of this routine. * * The parent of the resulting device and the devm context for allocations is * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { - return __devm_cxl_add_memdev(cxlds); + return __devm_cxl_add_memdev(cxlds, attach); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 1c6fc5334806..549368a9c868 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8a22b7601627..cb87e8c0e63c 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); -- cgit v1.2.3 From fb36d71308a770268c771d6697f22615e5ddbd6e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 19 Dec 2025 15:29:42 +0000 Subject: kselftest/arm64: Support FORCE_TARGETS The top level kselftest Makefile supports an option FORCE_TARGETS which causes any failures during the build to be propagated to the exit status of the top level make, useful during build testing. Currently the recursion done by the arm64 selftests ignores this option, meaning arm64 failures are not reported via this mechanism. Add the logic to implement FORCE_TARGETS so that it works for arm64. Signed-off-by: Mark Brown Acked-by: Shuah Khan Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index c4c72ee2ef55..e456f3b62fa1 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -30,13 +30,15 @@ all: @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir -p $$BUILD_TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done install: all @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done run_tests: all -- cgit v1.2.3 From 5c7a4741431b0a938dcbd22b90a4dc9a2903fc00 Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Tue, 6 Jan 2026 01:41:01 +0900 Subject: kunit: respect KBUILD_OUTPUT env variable by default Currently, kunit.py ignores the KBUILD_OUTPUT env variable and always defaults to .kunit in the working directory. This behavior is inconsistent with standard Kbuild behavior, where KBUILD_OUTPUT defines the build artifact location. This patch modifies kunit.py to respect KBUILD_OUTPUT if set. A .kunit subdirectory is created inside KBUILD_OUTPUT to avoid polluting the build directory. Link: https://lore.kernel.org/r/20260106-kunit-kbuild_output-v2-1-582281797343@gmail.com Reviewed-by: David Gow Signed-off-by: Ryota Sakamoto Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 7 ++++++- tools/testing/kunit/kunit_tool_test.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index cd99c1956331..e3d82a038f93 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -323,11 +323,16 @@ def get_default_jobs() -> int: return ncpu raise RuntimeError("os.cpu_count() returned None") +def get_default_build_dir() -> str: + if 'KBUILD_OUTPUT' in os.environ: + return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit') + return '.kunit' + def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', - type=str, default='.kunit', metavar='DIR') + type=str, default=get_default_build_dir(), metavar='DIR') parser.add_argument('--make_options', help='X=Y make option, can be repeated.', action='append', metavar='X=Y') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index bbba921e0eac..a55b5085310d 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -601,6 +601,7 @@ class KUnitMainTest(unittest.TestCase): all_passed_log = file.readlines() self.print_mock = mock.patch('kunit_printer.Printer.print').start() + mock.patch.dict(os.environ, clear=True).start() self.addCleanup(mock.patch.stopall) self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start() @@ -723,6 +724,24 @@ class KUnitMainTest(unittest.TestCase): args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_from_env(self): + build_dir = '/tmp/.kunit' + kunit.main(['run']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_override(self): + build_dir = '.kunit' + kunit.main(['run', '--build_dir=.kunit']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_config_builddir(self): build_dir = '.kunit' kunit.main(['config', '--build_dir', build_dir]) -- cgit v1.2.3 From 0c5b86c67fb6898d02c8f92de884186297fd302f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 13:26:35 +0100 Subject: kunit: tool: Add test for nested test result reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is a lack of tests validating the result reporting from nested tests. Add one, it will also be used to validate upcoming changes to the nested test parsing. Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-1-98cfbeb87823@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 10 ++++++++++ .../kunit/test_data/test_is_test_passed-failure-nested.log | 7 +++++++ 2 files changed, 17 insertions(+) create mode 100644 tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log (limited to 'tools') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index a55b5085310d..81a0996edef4 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -165,6 +165,16 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.errors, 0) + def test_parse_failed_nested_tests_log(self): + nested_log = test_data_path('test_is_test_passed-failure-nested.log') + with open(nested_log) as file: + result = kunit_parser.parse_run_tests(file.readlines(), stdout) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) + self.assertEqual(result.counts.failed, 2) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) + def test_no_header(self): empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log') with open(empty_log) as file: diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log new file mode 100644 index 000000000000..2e528da39ab5 --- /dev/null +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log @@ -0,0 +1,7 @@ +KTAP version 1 +1..2 +not ok 1 subtest 1 + KTAP version 1 + 1..1 + not ok 1 subsubtest 1 +not ok 2 subtest 2 -- cgit v1.2.3 From 85aff81b0dba7c42d226d9f7c11c4d30a7906878 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 13:26:36 +0100 Subject: kunit: tool: Don't overwrite test status based on subtest counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a subtest itself reports success, but the outer testcase fails, the whole testcase should be reported as a failure. However the status is recalculated based on the test counts, overwriting the outer test result. Synthesize a failed test in this case to make sure the failure is not swallowed. Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-2-98cfbeb87823@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_parser.py | 3 +++ tools/testing/kunit/kunit_tool_test.py | 1 + tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log | 3 +++ 3 files changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 333cd3a4a56b..5338489dcbe4 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -689,6 +689,9 @@ def bubble_up_test_results(test: Test) -> None: elif test.counts.get_status() == TestStatus.TEST_CRASHED: test.status = TestStatus.TEST_CRASHED + if status == TestStatus.FAILURE and test.counts.get_status() == TestStatus.SUCCESS: + counts.add_status(status) + def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool, printer: Printer) -> Test: """ Finds next test to parse in LineStream, creates new Test object, diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 81a0996edef4..bdc51b5c7b10 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -172,6 +172,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.failed, 2) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.subtests[0].subtests[0].status) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log index 2e528da39ab5..5498dfd0b0db 100644 --- a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log @@ -1,5 +1,8 @@ KTAP version 1 1..2 + KTAP version 1 + 1..1 + ok 1 test 1 not ok 1 subtest 1 KTAP version 1 1..1 -- cgit v1.2.3 From ab150c2bbafe9425759eca1e45e08d4ad1456818 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 2 Jan 2026 08:20:39 +0100 Subject: kunit: qemu_configs: Add 32-bit big endian ARM configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic config to run kunit tests on 32-bit big endian ARM. Link: https://lore.kernel.org/r/20260102-kunit-armeb-v1-1-e8e5475d735c@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/qemu_configs/armeb.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tools/testing/kunit/qemu_configs/armeb.py (limited to 'tools') diff --git a/tools/testing/kunit/qemu_configs/armeb.py b/tools/testing/kunit/qemu_configs/armeb.py new file mode 100644 index 000000000000..86d326651490 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/armeb.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='arm', + kconfig=''' +CONFIG_CPU_BIG_ENDIAN=y +CONFIG_ARCH_VIRT=y +CONFIG_SERIAL_AMBA_PL010=y +CONFIG_SERIAL_AMBA_PL010_CONSOLE=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y''', + qemu_arch='arm', + kernel_path='arch/arm/boot/zImage', + kernel_command_line='console=ttyAMA0', + extra_qemu_params=['-machine', 'virt']) -- cgit v1.2.3 From 11aa4a18094f04a8ba7e403c272a9a5d85c9c9fc Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Sun, 12 Oct 2025 10:11:30 +0300 Subject: tools/rtla: Remove unused function declarations Historically four function declarations remain orphaned or duplicated. Remove them to keep the source clean. Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251012071133.290225-1-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/osnoise.h | 3 --- tools/tracing/rtla/src/utils.h | 1 - 2 files changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h index 895687030c0b..75de0d5c706a 100644 --- a/tools/tracing/rtla/src/osnoise.h +++ b/tools/tracing/rtla/src/osnoise.h @@ -58,8 +58,6 @@ int osnoise_set_irq_disable(struct osnoise_context *context, bool onoff); void osnoise_report_missed_events(struct osnoise_tool *tool); int osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params); -int osnoise_hist_main(int argc, char *argv[]); -int osnoise_top_main(int argc, char **argv); int osnoise_enable(struct osnoise_tool *tool); int osnoise_main(int argc, char **argv); int hwnoise_main(int argc, char **argv); @@ -68,4 +66,3 @@ extern struct tool_ops timerlat_top_ops, timerlat_hist_ops; extern struct tool_ops osnoise_top_ops, osnoise_hist_ops; int run_tool(struct tool_ops *ops, int argc, char *argv[]); -int hist_main_loop(struct osnoise_tool *tool); diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h index 091df4ba4587..ed7618842e82 100644 --- a/tools/tracing/rtla/src/utils.h +++ b/tools/tracing/rtla/src/utils.h @@ -24,7 +24,6 @@ void fatal(const char *fmt, ...); long parse_seconds_duration(char *val); void get_duration(time_t start_time, char *output, int output_size); -int parse_cpu_list(char *cpu_list, char **monitored_cpus); char *parse_optional_arg(int argc, char **argv); long long get_llong_from_str(char *start); -- cgit v1.2.3 From ca7206b6ad029d2c35e64f1ea81dba385496e630 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:54 +0100 Subject: selftests/nolibc: test compatibility of nolibc and kernel time types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeping 'struct timespec' and 'struct __kernel_timespec' compatible allows the source code to stay simple. Validate that the types stay compatible. The test is specific to nolibc and does not compile on other libcs, so skip it there. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-10-c662992f75d7@weissschuh.net --- tools/testing/selftests/nolibc/nolibc-test.c | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6888b20af259..3986d55a6ff6 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1430,6 +1430,34 @@ int test_difftime(void) return 0; } +int test_time_types(void) +{ +#ifdef NOLIBC + struct __kernel_timespec kts; + struct timespec ts; + + if (!__builtin_types_compatible_p(time_t, __kernel_time64_t)) + return 1; + + if (sizeof(ts) != sizeof(kts)) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_sec), __typeof__(kts.tv_sec))) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_nsec), __typeof__(kts.tv_nsec))) + return 1; + + if (offsetof(__typeof__(ts), tv_sec) != offsetof(__typeof__(kts), tv_sec)) + return 1; + + if (offsetof(__typeof__(ts), tv_nsec) != offsetof(__typeof__(kts), tv_nsec)) + return 1; +#endif /* NOLIBC */ + + return 0; +} + int run_stdlib(int min, int max) { int test; @@ -1555,6 +1583,7 @@ int run_stdlib(int min, int max) CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break; CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break; CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break; + CASE_TEST(time_types); EXPECT_ZR(is_nolibc, test_time_types()); break; case __LINE__: return ret; /* must be last */ -- cgit v1.2.3 From 6c9be90527207f9beca78e698dd45969813f4c0e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:55 +0100 Subject: tools/nolibc: remove time conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that 'struct timespec' and 'struct __kernel_timespec' are compatible, the conversions are not necessary anymore. The same holds true for 'struct itimerspec' and 'struct __kernel_itimerspec'. Remove the conversions. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-11-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/timerfd.h | 20 ++---------- tools/include/nolibc/time.h | 64 ++++---------------------------------- 2 files changed, 8 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 66f779553d31..616fcfb416a9 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -33,13 +33,7 @@ static __attribute__((unused)) int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { #if defined(__NR_timerfd_gettime64) - struct __kernel_itimerspec kcurr_value; - int ret; - - ret = my_syscall2(__NR_timerfd_gettime64, fd, &kcurr_value); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); - return ret; + return my_syscall2(__NR_timerfd_gettime64, fd, curr_value); #else return my_syscall2(__NR_timerfd_gettime, fd, curr_value); #endif @@ -57,17 +51,7 @@ int sys_timerfd_settime(int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { #if defined(__NR_timerfd_settime64) - struct __kernel_itimerspec knew_value, kold_value; - int ret; - - __nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value); - __nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval); - ret = my_syscall4(__NR_timerfd_settime64, fd, flags, &knew_value, &kold_value); - if (old_value) { - __nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval); - __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); - } - return ret; + return my_syscall4(__NR_timerfd_settime64, fd, flags, new_value, old_value); #else return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); #endif diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index 45df9b09d7b6..ab67f209c99f 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -18,20 +18,6 @@ #include #include -static __inline__ -void __nolibc_timespec_user_to_kernel(const struct timespec *ts, struct __kernel_timespec *kts) -{ - kts->tv_sec = ts->tv_sec; - kts->tv_nsec = ts->tv_nsec; -} - -static __inline__ -void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struct timespec *ts) -{ - ts->tv_sec = kts->tv_sec; - ts->tv_nsec = kts->tv_nsec; -} - /* * int clock_getres(clockid_t clockid, struct timespec *res); * int clock_gettime(clockid_t clockid, struct timespec *tp); @@ -44,13 +30,7 @@ static __attribute__((unused)) int sys_clock_getres(clockid_t clockid, struct timespec *res) { #if defined(__NR_clock_getres_time64) - struct __kernel_timespec kres; - int ret; - - ret = my_syscall2(__NR_clock_getres_time64, clockid, &kres); - if (res) - __nolibc_timespec_kernel_to_user(&kres, res); - return ret; + return my_syscall2(__NR_clock_getres_time64, clockid, res); #else return my_syscall2(__NR_clock_getres, clockid, res); #endif @@ -66,13 +46,7 @@ static __attribute__((unused)) int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_gettime64) - struct __kernel_timespec ktp; - int ret; - - ret = my_syscall2(__NR_clock_gettime64, clockid, &ktp); - if (tp) - __nolibc_timespec_kernel_to_user(&ktp, tp); - return ret; + return my_syscall2(__NR_clock_gettime64, clockid, tp); #else return my_syscall2(__NR_clock_gettime, clockid, tp); #endif @@ -88,10 +62,7 @@ static __attribute__((unused)) int sys_clock_settime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_settime64) - struct __kernel_timespec ktp; - - __nolibc_timespec_user_to_kernel(tp, &ktp); - return my_syscall2(__NR_clock_settime64, clockid, &ktp); + return my_syscall2(__NR_clock_settime64, clockid, tp); #else return my_syscall2(__NR_clock_settime, clockid, tp); #endif @@ -108,14 +79,7 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt struct timespec *rmtp) { #if defined(__NR_clock_nanosleep_time64) - struct __kernel_timespec krqtp, krmtp; - int ret; - - __nolibc_timespec_user_to_kernel(rqtp, &krqtp); - ret = my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, &krqtp, &krmtp); - if (rmtp) - __nolibc_timespec_kernel_to_user(&krmtp, rmtp); - return ret; + return my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, rqtp, rmtp); #else return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); #endif @@ -190,13 +154,7 @@ static __attribute__((unused)) int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) { #if defined(__NR_timer_gettime64) - struct __kernel_itimerspec kcurr_value; - int ret; - - ret = my_syscall2(__NR_timer_gettime64, timerid, &kcurr_value); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); - return ret; + return my_syscall2(__NR_timer_gettime64, timerid, curr_value); #else return my_syscall2(__NR_timer_gettime, timerid, curr_value); #endif @@ -213,17 +171,7 @@ int sys_timer_settime(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { #if defined(__NR_timer_settime64) - struct __kernel_itimerspec knew_value, kold_value; - int ret; - - __nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value); - __nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval); - ret = my_syscall4(__NR_timer_settime64, timerid, flags, &knew_value, &kold_value); - if (old_value) { - __nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval); - __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); - } - return ret; + return my_syscall4(__NR_timer_settime64, timerid, flags, new_value, old_value); #else return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); #endif -- cgit v1.2.3 From dd6659efe0529e7177e9270a0fc044a0b17deb8a Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:56 +0100 Subject: tools/nolibc: add compiler version detection macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some upcoming logic needs to depend on the version of GCC or clang. Add some helper macros to keep the conditionals readable. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-12-c662992f75d7@weissschuh.net --- tools/include/nolibc/compiler.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 87090bbc53e0..c9ffd0496dae 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -47,4 +47,20 @@ # define __nolibc_fallthrough do { } while (0) #endif /* __nolibc_has_attribute(fallthrough) */ +#define __nolibc_version(_major, _minor, _patch) ((_major) * 10000 + (_minor) * 100 + (_patch)) + +#ifdef __GNUC__ +# define __nolibc_gnuc_version \ + __nolibc_version(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#else +# define __nolibc_gnuc_version 0 +#endif /* __GNUC__ */ + +#ifdef __clang__ +# define __nolibc_clang_version \ + __nolibc_version(__clang_major__, __clang_minor__, __clang_patchlevel__) +#else +# define __nolibc_clang_version 0 +#endif /* __clang__ */ + #endif /* _NOLIBC_COMPILER_H */ -- cgit v1.2.3 From 37219aa5b12326cd60f4586779c687f3394e80f5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:57 +0100 Subject: tools/nolibc: add __nolibc_static_assert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a wrapper for _Static_assert() to use within nolibc. While _Static_assert() itself was only standardized in C11, in GCC and clang dialects it is also available in older standards. Link: https://lore.kernel.org/lkml/20251203192330.GA12995@1wt.eu/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-13-c662992f75d7@weissschuh.net --- tools/include/nolibc/compiler.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index c9ffd0496dae..a8c7619dcdde 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -63,4 +63,12 @@ # define __nolibc_clang_version 0 #endif /* __clang__ */ +#if __STDC_VERSION__ >= 201112L || \ + __nolibc_gnuc_version >= __nolibc_version(4, 6, 0) || \ + __nolibc_clang_version >= __nolibc_version(3, 0, 0) +# define __nolibc_static_assert(_t) _Static_assert(_t, "") +#else +# define __nolibc_static_assert(_t) +#endif + #endif /* _NOLIBC_COMPILER_H */ -- cgit v1.2.3 From f3ed932644a671038b31f7f536a066eeef6803b0 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:58 +0100 Subject: selftests/nolibc: add static assertions around time types handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nolibc system call wrappers expect the libc types to be compatible to the kernel types. Make sure these expectations hold at compile-time. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-14-c662992f75d7@weissschuh.net --- tools/include/nolibc/sys/timerfd.h | 4 ++++ tools/include/nolibc/time.h | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 616fcfb416a9..29fd92bd47d2 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -33,8 +33,10 @@ static __attribute__((unused)) int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { #if defined(__NR_timerfd_gettime64) + __nolibc_assert_time64_type(curr_value->it_value.tv_sec); return my_syscall2(__NR_timerfd_gettime64, fd, curr_value); #else + __nolibc_assert_native_time64(); return my_syscall2(__NR_timerfd_gettime, fd, curr_value); #endif } @@ -51,8 +53,10 @@ int sys_timerfd_settime(int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { #if defined(__NR_timerfd_settime64) + __nolibc_assert_time64_type(new_value->it_value.tv_sec); return my_syscall4(__NR_timerfd_settime64, fd, flags, new_value, old_value); #else + __nolibc_assert_native_time64(); return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index ab67f209c99f..f9257d6a7878 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -18,6 +18,12 @@ #include #include +#define __nolibc_assert_time64_type(t) \ + __nolibc_static_assert(sizeof(t) == 8) + +#define __nolibc_assert_native_time64() \ + __nolibc_assert_time64_type(__kernel_old_time_t) + /* * int clock_getres(clockid_t clockid, struct timespec *res); * int clock_gettime(clockid_t clockid, struct timespec *tp); @@ -30,8 +36,10 @@ static __attribute__((unused)) int sys_clock_getres(clockid_t clockid, struct timespec *res) { #if defined(__NR_clock_getres_time64) + __nolibc_assert_time64_type(res->tv_sec); return my_syscall2(__NR_clock_getres_time64, clockid, res); #else + __nolibc_assert_native_time64(); return my_syscall2(__NR_clock_getres, clockid, res); #endif } @@ -46,8 +54,10 @@ static __attribute__((unused)) int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_gettime64) + __nolibc_assert_time64_type(tp->tv_sec); return my_syscall2(__NR_clock_gettime64, clockid, tp); #else + __nolibc_assert_native_time64(); return my_syscall2(__NR_clock_gettime, clockid, tp); #endif } @@ -62,8 +72,10 @@ static __attribute__((unused)) int sys_clock_settime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_settime64) + __nolibc_assert_time64_type(tp->tv_sec); return my_syscall2(__NR_clock_settime64, clockid, tp); #else + __nolibc_assert_native_time64(); return my_syscall2(__NR_clock_settime, clockid, tp); #endif } @@ -79,8 +91,10 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt struct timespec *rmtp) { #if defined(__NR_clock_nanosleep_time64) + __nolibc_assert_time64_type(rqtp->tv_sec); return my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, rqtp, rmtp); #else + __nolibc_assert_native_time64(); return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); #endif } @@ -154,8 +168,10 @@ static __attribute__((unused)) int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) { #if defined(__NR_timer_gettime64) + __nolibc_assert_time64_type(curr_value->it_value.tv_sec); return my_syscall2(__NR_timer_gettime64, timerid, curr_value); #else + __nolibc_assert_native_time64(); return my_syscall2(__NR_timer_gettime, timerid, curr_value); #endif } @@ -171,8 +187,10 @@ int sys_timer_settime(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { #if defined(__NR_timer_settime64) + __nolibc_assert_time64_type(new_value->it_value.tv_sec); return my_syscall4(__NR_timer_settime64, timerid, flags, new_value, old_value); #else + __nolibc_assert_native_time64(); return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); #endif } -- cgit v1.2.3 From 03139924859f7930cd1667a278a560b5d4c6a672 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 4 Jan 2026 15:57:51 +0100 Subject: selftests/nolibc: drop NOLIBC_SYSROOT=0 logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This logic was added in commit 850fad7de827 ("selftests/nolibc: allow test -include /path/to/nolibc.h") to allow the testing of -include /path/to/nolibc.h. As it requires as special variable to activate, this code is nearly never used. Furthermore it complicates the logic a bit. Since commit a6a054c8ad32 ("tools/nolibc: add target to check header usability") and commit 443c6467fcd6 ("selftests/nolibc: always run nolibc header check") the usability of -include /path/to/nolibc.h is always checked anyways, making NOLIBC_SYSROOT=0 pointless. Drop the special logic. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260104-nolibc-nolibc_sysroot-v1-1-98025ad99add@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 6 ------ 1 file changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index f9d43cbdc894..b17ba2f8fb46 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -302,15 +302,9 @@ sysroot/$(ARCH)/include: $(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone headers_check $(Q)mv sysroot/sysroot sysroot/$(ARCH) -ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -else -nolibc-test: nolibc-test.c nolibc-test-linkage.c - $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -endif libc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(HOSTCC) -o $@ nolibc-test.c nolibc-test-linkage.c -- cgit v1.2.3 From 57624b38ce99b906cbb191a1d536bb871ad2d8c2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 4 Jan 2026 23:43:13 +0100 Subject: tools/nolibc: align sys_vfork() with sys_fork() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the generic variants of sys_fork() and sys_vfork() differ in both they precedence of used system calls and the usage of sys_clone() vs sys_clone3(). While the interface of clone3() in sys_vfork() is more consistent over different architectures, qemu-user does not support it, making testing harder. We already handle the different clone() interfaces for sys_fork() in the architecture-specific headers, and can do so also for sys_vfork(). In fact SPARC already has such handling and only s390 is currently missing. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260104-nolibc-vfork-v1-1-a63464b9e4e6@weissschuh.net --- tools/include/nolibc/arch-s390.h | 8 ++++++++ tools/include/nolibc/sys.h | 18 +++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 5bee6ecbde0a..904281e95f99 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -8,6 +8,7 @@ #include "types.h" +#include #include #include @@ -189,4 +190,11 @@ pid_t sys_fork(void) } #define sys_fork sys_fork +static __attribute__((unused)) +pid_t sys_vfork(void) +{ + return my_syscall5(__NR_clone, 0, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0); +} +#define sys_vfork sys_vfork + #endif /* _NOLIBC_ARCH_S390_H */ diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 847af1ccbdc9..403ee9ce8389 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -22,7 +22,7 @@ #include #include #include /* for O_* and AT_* */ -#include /* for clone_args */ +#include /* for CLONE_* */ #include /* for statx() */ #include "errno.h" @@ -363,19 +363,11 @@ pid_t fork(void) static __attribute__((unused)) pid_t sys_vfork(void) { -#if defined(__NR_vfork) +#if defined(__NR_clone) + /* See the note in sys_fork(). */ + return my_syscall5(__NR_clone, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0, 0); +#elif defined(__NR_vfork) return my_syscall0(__NR_vfork); -#else - /* - * clone() could be used but has different argument orders per - * architecture. - */ - struct clone_args args = { - .flags = CLONE_VM | CLONE_VFORK, - .exit_signal = SIGCHLD, - }; - - return my_syscall2(__NR_clone3, &args, sizeof(args)); #endif } #endif -- cgit v1.2.3 From 6b6dbf3e4ecfd7d1086bd7cd8b31ca8e45d4dc1f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 11:39:55 +0100 Subject: selftests/nolibc: always build sparc32 tests with -mcpu=v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since LLVM commit 39e30508a7f6 ("[Driver][Sparc] Default to -mcpu=v9 for 32-bit Linux/sparc64 (#109278)"), clang defaults to -mcpu=v9 for 32-bit SPARC builds. -mcpu=v9 generates instructions which are not recognized by qemu-sparc and qemu-system-sparc. Explicitly enforce -mcpu=v8 to generate compatible code. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-sparc32-fix-v2-1-7c5cd6b175c2@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index b17ba2f8fb46..f5704193038f 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -226,7 +226,7 @@ CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6 CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6 CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2 CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld) -CFLAGS_sparc32 = $(call cc-option,-m32) +CFLAGS_sparc32 = $(call cc-option,-m32) -mcpu=v8 CFLAGS_sh4 = -ml -m4 ifeq ($(origin XARCH),command line) CFLAGS_XARCH = $(CFLAGS_$(XARCH)) -- cgit v1.2.3 From 2246c24426fbc1069cb2a47e0624ccffe5f2627b Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Wed, 31 Dec 2025 14:42:28 -0800 Subject: perf pmu: Relax uncore wildcard matching to allow numeric suffix Diamond Rapids introduces two types of PCIe related uncore PMUs: "uncore_pcie4_*" and "uncore_pcie6_*". To ensure that generic PCIe events (e.g., UNC_PCIE_CLOCKTICKS) can match and collect events from both PMU types, slightly relax the wildcard matching logic in perf_pmu__match_wildcard(). This change allows a wildcard such as "pcie" to match PMU names that include a numeric suffix, such as "pcie4_*" and "pcie6_*". Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251231224233.113839-12-zide.chen@intel.com --- tools/perf/util/pmu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 956ea273c2c7..01a21b6aa031 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -939,6 +939,7 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) { const char *p, *suffix; bool has_hex = false; + bool has_underscore = false; size_t tok_len = strlen(tok); /* Check start of pmu_name for equality. */ @@ -949,13 +950,14 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) if (*p == 0) return true; - if (*p == '_') { - ++p; - ++suffix; - } - - /* Ensure we end in a number */ + /* Ensure we end in a number or a mix of number and "_". */ while (1) { + if (!has_underscore && (*p == '_')) { + has_underscore = true; + ++p; + ++suffix; + } + if (!isxdigit(*p)) return false; if (!has_hex) -- cgit v1.2.3 From ab86d0bf01f6d0e37fd67761bb62918321b64efc Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 5 Jan 2026 12:47:46 +0100 Subject: selftests/bpf: Update xdp_context_test_run test to check maximum metadata size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the selftest to check that the metadata size check takes the xdp_frame size into account in bpf_prog_test_run. The original check (for meta size 256) was broken because the data frame supplied was smaller than this, triggering a different EINVAL return. So supply a larger data frame for this test to make sure we actually exercise the check we think we are. Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Amery Hung Link: https://lore.kernel.org/r/20260105114747.1358750-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_context_test_run.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index ee94c281888a..26159e0499c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -47,6 +47,7 @@ void test_xdp_context_test_run(void) struct test_xdp_context_test_run *skel = NULL; char data[sizeof(pkt_v4) + sizeof(__u32)]; char bad_ctx[sizeof(struct xdp_md) + 1]; + char large_data[256]; struct xdp_md ctx_in, ctx_out; DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data, @@ -94,9 +95,6 @@ void test_xdp_context_test_run(void) test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data), 0, 0, 0); - /* Meta data must be 255 bytes or smaller */ - test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); @@ -116,6 +114,16 @@ void test_xdp_context_test_run(void) test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), 0, 0, 1); + /* Meta data must be 216 bytes or smaller (256 - sizeof(struct + * xdp_frame)). Test both nearest invalid size and nearest invalid + * 4-byte-aligned size, and make sure data_in is large enough that we + * actually hit the check on metadata length + */ + opts.data_in = large_data; + opts.data_size_in = sizeof(large_data); + test_xdp_context_error(prog_fd, opts, 0, 217, sizeof(large_data), 0, 0, 0); + test_xdp_context_error(prog_fd, opts, 0, 220, sizeof(large_data), 0, 0, 0); + test_xdp_context_test_run__destroy(skel); } -- cgit v1.2.3 From e970637707f4f8e5bd098b09090b755f2f57898b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 2 Jan 2026 12:06:57 -0800 Subject: docs: find-unused-docs.sh: fixup directory usage The recent move of this script from scripts/ to tools/docs/ did not account for the 'cd' directory usage. Update "cd .." to "cd ../.." to make the script self-correcting. This also eliminates a shell warning: ./tools/docs/find-unused-docs.sh: line 33: cd: Documentation/: No such file or directory Fixes: 184414c6a6ca ("docs: move find-unused-docs.sh to tools/docs") Signed-off-by: Randy Dunlap Fixes: 184414c6a6ca (docs: move find-unused-docs.sh to tools/docs) Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Jonathan Corbet Message-ID: <20260102200657.1040234-1-rdunlap@infradead.org> --- tools/docs/find-unused-docs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh index 05552dbda5bc..ca4e607ec3f7 100755 --- a/tools/docs/find-unused-docs.sh +++ b/tools/docs/find-unused-docs.sh @@ -28,7 +28,7 @@ if ! [ -d "$1" ]; then fi cd "$( dirname "${BASH_SOURCE[0]}" )" -cd .. +cd ../.. cd Documentation/ -- cgit v1.2.3 From b04d2b9199129f4f0c992a518c0fb78c2efc1064 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 10 Dec 2025 08:17:52 +0100 Subject: perf test: Fix test case perf evlist tests for s390x Perf test case 78: perf evlist tests fails on s390. The failure is causes by grouping events cycles and instructions because sampling does only support event cycles. Change the group to software events to fix this. Output before: # ./perf test 78 78: perf evlist tests : FAILED! # Output after: # ./perf test 78 78: perf evlist tests : Ok # Fixes: db452961de939225 ("perf tests evlist: Add basic evlist test") Signed-off-by: Thomas Richter Tested-by: Ian Rogers Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Jan Polensky Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Vasily Gorbik Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/evlist.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/evlist.sh b/tools/perf/tests/shell/evlist.sh index 140f099e75c1..5632be391710 100755 --- a/tools/perf/tests/shell/evlist.sh +++ b/tools/perf/tests/shell/evlist.sh @@ -38,13 +38,14 @@ test_evlist_simple() { test_evlist_group() { echo "Group evlist test" - if ! perf record -e "{cycles,instructions}" -o "${perfdata}" true 2> /dev/null + if ! perf record -e "{cpu-clock,task-clock}" -o "${perfdata}" \ + -- perf test -w noploop 2> /dev/null then echo "Group evlist [Skipped event group recording failed]" return fi - if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cycles.*,.*instructions.*}" + if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cpu-clock.*,.*task-clock.*}" then echo "Group evlist [Failed to list event group]" err=1 -- cgit v1.2.3 From 1ec205e3669c12dfb0adbd2d7099922c195b46ff Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 5 Dec 2025 11:01:35 -0800 Subject: perf test java symbol: Additional libperf-jvmti.so path check If perf is built into an output directory then so is libperf-jvmti.so. If `perf test` is run from that directory then PWD needn't also be that directory meaning libperf-jvmti.so won't be found and the test skipped. Add an additional check for libperf-jvmti.so in the same directory as the perf binary for this case, this avoids the test skipping. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_java_symbol.sh | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh index 499539d1c479..63a2cc9bf13f 100755 --- a/tools/perf/tests/shell/test_java_symbol.sh +++ b/tools/perf/tests/shell/test_java_symbol.sh @@ -22,10 +22,13 @@ cleanup_files() trap cleanup_files exit term int +PERF_DIR=$(dirname "$(which perf)") if [ -e "$PWD/tools/perf/libperf-jvmti.so" ]; then LIBJVMTI=$PWD/tools/perf/libperf-jvmti.so elif [ -e "$PWD/libperf-jvmti.so" ]; then LIBJVMTI=$PWD/libperf-jvmti.so +elif [ -e "$PERF_DIR/libperf-jvmti.so" ]; then + LIBJVMTI=$PERF_DIR/libperf-jvmti.so elif [ -e "$PREFIX/lib64/libperf-jvmti.so" ]; then LIBJVMTI=$PREFIX/lib64/libperf-jvmti.so elif [ -e "$PREFIX/lib/libperf-jvmti.so" ]; then @@ -34,6 +37,7 @@ elif [ -e "/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-gen LIBJVMTI=/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-generic//')/libperf-jvmti.so else echo "Fail to find libperf-jvmti.so" + # JVMTI is a build option, skip the test if fail to find lib exit 2 fi -- cgit v1.2.3 From 7fc37b588aaaf72145764b4c3b6184431471b3e0 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 23 Dec 2025 17:00:24 +0000 Subject: perf build: Remove FEATURE_CHECK_LDFLAGS-disassembler-{four-args,init-styled} setting As the building mechanism is now able to retry detection with different combinations of linking flags, setting FEATURE_CHECK_LDFLAGS-disassembler-four-args and FEATURE_CHECK_LDFLAGS-disassembler-init-styled is not necessary anymore, so remove it. James Clark notes: Use the same technique to find the set of bfd-related libraries to link as in: 3308ffc5016e6136 ("tools, build: Retry detection of bfd-related features") Signed-off-by: Roberto Sassu Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andres Freund Cc: Andrii Nakryiko Cc: Bill Wendling Cc: Daniel Borkmann Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: Justin Stitt Cc: KP Singh Cc: Leo Yan Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Nick Terrell Cc: Peter Zijlstra Cc: Quentin Monnet Cc: Song Liu Cc: Stanislav Fomichev Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index d8d25f62aaad..bbfebbe26f4d 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -379,8 +379,8 @@ ifneq ($(TCMALLOC),) endif ifeq ($(FEATURES_DUMP),) -# We will display at the end of this Makefile.config, using $(call feature_display_entries) -# As we may retry some feature detection here, see the disassembler-four-args case, for instance +# We will display at the end of this Makefile.config, using $(call feature_display_entries), +# as we may retry some feature detection here. FEATURE_DISPLAY_DEFERRED := 1 include $(srctree)/tools/build/Makefile.feature else @@ -935,8 +935,6 @@ ifdef BUILD_NONDISTRO ifeq ($(feature-libbfd), 1) EXTLIBS += -lbfd -lopcodes - FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl - FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl else # we are on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here @@ -948,13 +946,9 @@ ifdef BUILD_NONDISTRO ifeq ($(feature-libbfd-liberty), 1) EXTLIBS += -lbfd -lopcodes -liberty - FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -ldl - FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -ldl else ifeq ($(feature-libbfd-liberty-z), 1) EXTLIBS += -lbfd -lopcodes -liberty -lz - FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -lz -ldl - FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -lz -ldl endif endif $(call feature_check,disassembler-four-args) @@ -1332,6 +1326,6 @@ endif # re-generate FEATURE-DUMP as we may have called feature_check, found out # extra libraries to add to LDFLAGS of some other test and then redo those -# tests, see the block about libbfd, disassembler-four-args, for instance. +# tests. $(shell rm -f $(FEATURE_DUMP_FILENAME)) $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME))) -- cgit v1.2.3 From ae323bc241d25f5ebc56f0b2a6d580b7233647c0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Dec 2025 17:00:25 +0000 Subject: perf build: Do all non-distro feature checks in one go None of the if statements or variable assignments in the non-distro block actually affect the feature checks. Just do them all in one place so the flow isn't obscured. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Bill Wendling Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Justin Stitt Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index bbfebbe26f4d..85075de2aedd 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -931,36 +931,28 @@ ifneq ($(NO_JEVENTS),1) endif ifdef BUILD_NONDISTRO + # call all detections now so we get correct status in VF output $(call feature_check,libbfd) + $(call feature_check,disassembler-four-args) + $(call feature_check,disassembler-init-styled) + $(call feature_check,libbfd-buildid) + $(call feature_check,libbfd-liberty) + $(call feature_check,libbfd-liberty-z) + # we may be on a system that requires -liberty and (maybe) -lz + # to link against -lbfd; test each case individually here ifeq ($(feature-libbfd), 1) EXTLIBS += -lbfd -lopcodes - else - # we are on a system that requires -liberty and (maybe) -lz - # to link against -lbfd; test each case individually here - - # call all detections now so we get correct - # status in VF output - $(call feature_check,libbfd-liberty) - $(call feature_check,libbfd-liberty-z) - - ifeq ($(feature-libbfd-liberty), 1) - EXTLIBS += -lbfd -lopcodes -liberty - else - ifeq ($(feature-libbfd-liberty-z), 1) - EXTLIBS += -lbfd -lopcodes -liberty -lz - endif - endif - $(call feature_check,disassembler-four-args) - $(call feature_check,disassembler-init-styled) + else ifeq ($(feature-libbfd-liberty), 1) + EXTLIBS += -lbfd -lopcodes -liberty + else ifeq ($(feature-libbfd-liberty-z), 1) + EXTLIBS += -lbfd -lopcodes -liberty -lz endif CFLAGS += -DHAVE_LIBBFD_SUPPORT CXXFLAGS += -DHAVE_LIBBFD_SUPPORT $(call detected,CONFIG_LIBBFD) - $(call feature_check,libbfd-buildid) - ifeq ($(feature-libbfd-buildid), 1) CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT else -- cgit v1.2.3 From c0cb97a275ffa00d91a0715dce8105ae3f627727 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Dec 2025 17:00:26 +0000 Subject: perf build: Remove unused libbfd-buildid feature test HAVE_LIBBFD_BUILDID_SUPPORT isn't used in the codebase so remove the feature test that sets it. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Bill Wendling Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Justin Stitt Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 - tools/build/feature/Makefile | 4 ---- tools/build/feature/test-libbfd-buildid.c | 8 -------- tools/perf/Makefile.config | 7 ------- 4 files changed, 20 deletions(-) delete mode 100644 tools/build/feature/test-libbfd-buildid.c (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 362cf8f4a0a0..bbaa88bb9b30 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -71,7 +71,6 @@ FEATURE_TESTS_BASIC := \ gettid \ glibc \ libbfd \ - libbfd-buildid \ libelf \ libelf-getphdrnum \ libelf-gelf_getnote \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 0d5a15654b17..d84db7df7988 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -13,7 +13,6 @@ FILES= \ test-gtk2-infobar.bin \ test-hello.bin \ test-libbfd.bin \ - test-libbfd-buildid.bin \ test-disassembler-four-args.bin \ test-disassembler-init-styled.bin \ test-reallocarray.bin \ @@ -268,9 +267,6 @@ $(OUTPUT)test-libpython.bin: $(OUTPUT)test-libbfd.bin: $(BUILD_BFD) -$(OUTPUT)test-libbfd-buildid.bin: - $(BUILD_BFD) || $(BUILD_BFD) -liberty || $(BUILD_BFD) -liberty -lz - $(OUTPUT)test-disassembler-four-args.bin: $(BUILD_BFD) -lopcodes || $(BUILD_BFD) -lopcodes -liberty || \ $(BUILD_BFD) -lopcodes -liberty -lz diff --git a/tools/build/feature/test-libbfd-buildid.c b/tools/build/feature/test-libbfd-buildid.c deleted file mode 100644 index 157644b04c05..000000000000 --- a/tools/build/feature/test-libbfd-buildid.c +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include - -int main(void) -{ - bfd *abfd = bfd_openr("Pedro", 0); - return abfd && (!abfd->build_id || abfd->build_id->size > 0x506564726f); -} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 85075de2aedd..fb1cf2bf5d83 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -935,7 +935,6 @@ ifdef BUILD_NONDISTRO $(call feature_check,libbfd) $(call feature_check,disassembler-four-args) $(call feature_check,disassembler-init-styled) - $(call feature_check,libbfd-buildid) $(call feature_check,libbfd-liberty) $(call feature_check,libbfd-liberty-z) @@ -953,12 +952,6 @@ ifdef BUILD_NONDISTRO CXXFLAGS += -DHAVE_LIBBFD_SUPPORT $(call detected,CONFIG_LIBBFD) - ifeq ($(feature-libbfd-buildid), 1) - CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT - else - $(warning Old version of libbfd/binutils things like PE executable profiling will not be available) - endif - ifeq ($(feature-disassembler-four-args), 1) CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE endif -- cgit v1.2.3 From cff602f65988da48cc1b84f6c3588a25a320fa81 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Dec 2025 17:00:27 +0000 Subject: perf build: Feature test for libbfd thread safety API The non-distro build requires libbfd 2.42 since commit b72b8132d8fd ("perf libbfd: Ensure libbfd is initialized prior to use"). Add a feature test so that it's obvious why the build fails if this criteria isn't met. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Bill Wendling Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Justin Stitt Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/build/feature/Makefile | 4 ++++ tools/build/feature/test-libbfd-threadsafe.c | 18 ++++++++++++++++++ tools/perf/Makefile.config | 5 +++++ 4 files changed, 28 insertions(+) create mode 100644 tools/build/feature/test-libbfd-threadsafe.c (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index bbaa88bb9b30..7f119eafc7c4 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -71,6 +71,7 @@ FEATURE_TESTS_BASIC := \ gettid \ glibc \ libbfd \ + libbfd-threadsafe \ libelf \ libelf-getphdrnum \ libelf-gelf_getnote \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index d84db7df7988..5c15572d505e 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -13,6 +13,7 @@ FILES= \ test-gtk2-infobar.bin \ test-hello.bin \ test-libbfd.bin \ + test-libbfd-threadsafe.bin \ test-disassembler-four-args.bin \ test-disassembler-init-styled.bin \ test-reallocarray.bin \ @@ -267,6 +268,9 @@ $(OUTPUT)test-libpython.bin: $(OUTPUT)test-libbfd.bin: $(BUILD_BFD) +$(OUTPUT)test-libbfd-threadsafe.bin: + $(BUILD_BFD) || $(BUILD_BFD) -liberty || $(BUILD_BFD) -liberty -lz + $(OUTPUT)test-disassembler-four-args.bin: $(BUILD_BFD) -lopcodes || $(BUILD_BFD) -lopcodes -liberty || \ $(BUILD_BFD) -lopcodes -liberty -lz diff --git a/tools/build/feature/test-libbfd-threadsafe.c b/tools/build/feature/test-libbfd-threadsafe.c new file mode 100644 index 000000000000..fe97f95f6f06 --- /dev/null +++ b/tools/build/feature/test-libbfd-threadsafe.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +static bool lock(void *unused) +{ + return true; +} + +static bool unlock(void *unused) +{ + return true; +} + +int main(void) +{ + /* Check for presence of new thread safety API (version 2.42) */ + return !bfd_thread_init(lock, unlock, NULL); +} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index fb1cf2bf5d83..6f2c7bd36e74 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -935,9 +935,14 @@ ifdef BUILD_NONDISTRO $(call feature_check,libbfd) $(call feature_check,disassembler-four-args) $(call feature_check,disassembler-init-styled) + $(call feature_check,libbfd-threadsafe) $(call feature_check,libbfd-liberty) $(call feature_check,libbfd-liberty-z) + ifneq ($(feature-libbfd-threadsafe), 1) + $(error binutils 2.42 or later is required for non-distro builds) + endif + # we may be on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here ifeq ($(feature-libbfd), 1) -- cgit v1.2.3 From 523471c5163659c61132274123c5470286e407ce Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Dec 2025 17:00:28 +0000 Subject: perf build: Skip nondistro build test if libbfd is old Non distro builds now require a new version of libbfd, so skip the test if the library is too old. The grep test isn't a strong as the feature test in test-libbfd-threadsafe.c, but there seems to be precedent for feature testing this way here and it's good enough for the build-test rule. If the function exists but returns an error it will be picked up by the feature test when attempting the build. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Bill Wendling Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Justin Stitt Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 6641701e4828..36411b4b6d2b 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -124,6 +124,9 @@ make_minimal += NO_LIBDW_DWARF_UNWIND=1 NO_LIBBPF=1 make_minimal += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1 make_minimal += NO_LIBCAP=1 NO_CAPSTONE=1 +# binutils 2_42 and newer have bfd_thread_init() +new_libbfd := $(shell echo '#include ' | $(CC) -E -x c - | grep bfd_thread_init) + # $(run) contains all available tests run := make_pure # Targets 'clean all' can be run together only through top level @@ -137,7 +140,9 @@ MAKE_F := $(MAKE) -f $(MK) endif run += make_python_perf_so run += make_debug +ifneq ($(new_libbfd),) run += make_nondistro +endif run += make_extra_tests run += make_jevents_all run += make_no_bpf_skel -- cgit v1.2.3 From 8e746e95c3e4eb56ae261feb9ae261bce1f96947 Mon Sep 17 00:00:00 2001 From: Derek Foreman Date: Fri, 28 Nov 2025 15:50:17 -0600 Subject: perf data: Allow filtering conversion by time range This adds a feature to allow restricting the range of converted samples with a range string like perf-script and perf-report --time. Committer testing: Put a probe on the ICMP receive path handling broadcast packets: # perf probe icmp_rcv:64 Added new event: probe:icmp_rcv_L64 (on icmp_rcv:64) You can now use it in all perf tools, such as: perf record -e probe:icmp_rcv_L64 -aR sleep 1 # perf record -e probe:icmp_rcv_L64 ping -c 10 -b 127.255.255.255 WARNING: pinging broadcast address PING 127.255.255.255 (127.255.255.255) 56(84) bytes of data. ^C --- 127.255.255.255 ping statistics --- 10 packets transmitted, 0 received, 100% packet loss, time 9217ms [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.034 MB perf.data (10 samples) ] # perf script ping 52785 [009] 5847.300394: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5848.325018: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5849.349007: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5850.372979: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5851.396988: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5852.420954: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5853.444934: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5854.468926: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5855.492914: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5856.516883: probe:icmp_rcv_L64: (ffffffffaadb337e) # Now get some slices using perf script: # perf script --time 40% ping 52785 [009] 5847.300394: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5848.325018: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5849.349007: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5850.372979: probe:icmp_rcv_L64: (ffffffffaadb337e) # perf script --time 40%-60% ping 52785 [009] 5851.396988: probe:icmp_rcv_L64: (ffffffffaadb337e) ping 52785 [009] 5852.420954: probe:icmp_rcv_L64: (ffffffffaadb337e) # And finally use this new feature: # perf data convert --to-json out.json --time 0%-10% [ perf data convert: Converted 'perf.data' into JSON data 'out.json' ] [ perf data convert: Converted and wrote 0.001 MB (1 samples) ] [ perf data convert: Skipped 9 samples ] # cat out.json { "linux-perf-json-version": 1, "headers": { "header-version": 1, "captured-on": "2026-01-06T22:26:40Z", "data-offset": 520, "data-size": 34648, "feat-offset": 35168, "hostname": "number", "os-release": "6.17.12-300.fc43.x86_64", "arch": "x86_64", "cpu-desc": "AMD Ryzen 9 9950X3D 16-Core Processor", "cpuid": "AuthenticAMD,26,68,0", "nrcpus-online": 32, "nrcpus-avail": 32, "perf-version": "6.19.rc4.gf4c270685d3d", "cmdline": [ "/home/acme/bin/perf" ] }, "samples": [ { "timestamp": 5847300394661, "pid": 52785, "tid": 52785, "cpu": 9, "comm": "ping", "callchain": [ { "ip": "0xffffffffaadb337f", "symbol": "icmp_rcv", "dso": "[kernel.kallsyms]" } ], "__probe_ip": "ffffffffaadb337e" } ] } # Signed-off-by: Derek Foreman Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-data.txt | 28 ++++++++++++++++++++++++++++ tools/perf/builtin-data.c | 3 +++ tools/perf/util/data-convert-bt.c | 31 +++++++++++++++++++++++++++++++ tools/perf/util/data-convert-json.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/data-convert.h | 1 + 5 files changed, 96 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index 417bf17e265c..20f178d61ed7 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -40,6 +40,34 @@ OPTIONS for 'convert' --force:: Don't complain, do it. +--time:: + Only convert samples within given time window: ,. Times + have the format seconds.nanoseconds. If start is not given (i.e. time + string is ',x.y') then analysis starts at the beginning of the file. If + stop time is not given (i.e. time string is 'x.y,') then analysis goes + to end of file. Multiple ranges can be separated by spaces, which + requires the argument to be quoted e.g. --time "1234.567,1234.789 1235," + + Also support time percent with multiple time ranges. Time string is + 'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'. + + For example: + Select the second 10% time slice: + + perf data convert --to-json out.json --time 10%/2 + + Select from 0% to 10% time slice: + + perf data convert --to-json out.json --time 0%-10% + + Select the first and second 10% time slices: + + perf data convert --to-json out.json --time 10%/1,10%/2 + + Select from 0% to 10% and 30% to 40% slices: + + perf data convert --to-json out.json --time 0%-10%,30%-40% + -v:: --verbose:: Be more verbose (show counter open errors, etc). diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c index ce51cbf6dc97..85f59886b5cf 100644 --- a/tools/perf/builtin-data.c +++ b/tools/perf/builtin-data.c @@ -33,6 +33,7 @@ const char *to_ctf; struct perf_data_convert_opts opts = { .force = false, .all = false, + .time_str = NULL, }; const struct option data_options[] = { @@ -45,6 +46,8 @@ const struct option data_options[] = { #endif OPT_BOOLEAN('f', "force", &opts.force, "don't complain, do it"), OPT_BOOLEAN(0, "all", &opts.all, "Convert all events"), + OPT_STRING(0, "time", &opts.time_str, "str", + "Time span of interest (start,stop)"), OPT_END() }; diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 3d2e437e1354..0bcbc0e309e0 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -34,6 +34,7 @@ #include "util.h" #include "clockid.h" #include "util/sample.h" +#include "util/time-utils.h" #ifdef HAVE_LIBTRACEEVENT #include @@ -91,9 +92,14 @@ struct convert { struct perf_tool tool; struct ctf_writer writer; + struct perf_time_interval *ptime_range; + int range_size; + int range_num; + u64 events_size; u64 events_count; u64 non_sample_count; + u64 skipped; /* Ordered events configured queue size. */ u64 queue_size; @@ -811,6 +817,11 @@ static int process_sample_event(const struct perf_tool *tool, if (WARN_ONCE(!priv, "Failed to setup all events.\n")) return 0; + if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) { + ++c->skipped; + return 0; + } + event_class = priv->event_class; /* update stats */ @@ -1644,6 +1655,15 @@ int bt_convert__perf2ctf(const char *input, const char *path, if (IS_ERR(session)) return PTR_ERR(session); + if (opts->time_str) { + err = perf_time__parse_for_ranges(opts->time_str, session, + &c.ptime_range, + &c.range_size, + &c.range_num); + if (err < 0) + goto free_session; + } + /* CTF writer */ if (ctf_writer__init(cw, path, session, opts->tod)) goto free_session; @@ -1687,6 +1707,14 @@ int bt_convert__perf2ctf(const char *input, const char *path, else fprintf(stderr, ", %" PRIu64 " non-samples) ]\n", c.non_sample_count); + if (c.skipped) { + fprintf(stderr, "[ perf data convert: Skipped %" PRIu64 " samples ]\n", + c.skipped); + } + + if (c.ptime_range) + zfree(&c.ptime_range); + cleanup_events(session); perf_session__delete(session); ctf_writer__cleanup(cw); @@ -1696,6 +1724,9 @@ int bt_convert__perf2ctf(const char *input, const char *path, free_writer: ctf_writer__cleanup(cw); free_session: + if (c.ptime_range) + zfree(&c.ptime_range); + perf_session__delete(session); pr_err("Error during conversion setup.\n"); return err; diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 9dc1e184cf3c..787039967916 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -25,6 +25,7 @@ #include "util/session.h" #include "util/symbol.h" #include "util/thread.h" +#include "util/time-utils.h" #include "util/tool.h" #ifdef HAVE_LIBTRACEEVENT @@ -35,7 +36,12 @@ struct convert_json { struct perf_tool tool; FILE *out; bool first; + struct perf_time_interval *ptime_range; + int range_size; + int range_num; + u64 events_count; + u64 skipped; }; // Outputs a JSON-encoded string surrounded by quotes with characters escaped. @@ -165,6 +171,11 @@ static int process_sample_event(const struct perf_tool *tool, return -1; } + if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) { + ++c->skipped; + return 0; + } + ++c->events_count; if (c->first) @@ -320,6 +331,10 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, struct convert_json c = { .first = true, .events_count = 0, + .ptime_range = NULL, + .range_size = 0, + .range_num = 0, + .skipped = 0, }; struct perf_data data = { .mode = PERF_DATA_MODE_READ, @@ -382,6 +397,15 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, goto err_session_delete; } + if (opts->time_str) { + ret = perf_time__parse_for_ranges(opts->time_str, session, + &c.ptime_range, + &c.range_size, + &c.range_num); + if (ret < 0) + goto err_session_delete; + } + // The opening brace is printed manually because it isn't delimited from a // previous value (i.e. we don't want a leading newline) fputc('{', c.out); @@ -411,7 +435,16 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n", (ftell(c.out)) / 1024.0 / 1024.0, c.events_count); + if (c.skipped) { + fprintf(stderr, "[ perf data convert: Skipped %" PRIu64 " samples ]\n", + c.skipped); + } + ret = 0; + + if (c.ptime_range) + zfree(&c.ptime_range); + err_session_delete: perf_session__delete(session); err_fclose: diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h index 1b4c5f598415..ee651fa680a1 100644 --- a/tools/perf/util/data-convert.h +++ b/tools/perf/util/data-convert.h @@ -8,6 +8,7 @@ struct perf_data_convert_opts { bool force; bool all; bool tod; + const char *time_str; }; #ifdef HAVE_LIBBABELTRACE_SUPPORT -- cgit v1.2.3 From 75326c67aa8c43000819a2ac29f22eb27846d545 Mon Sep 17 00:00:00 2001 From: Derek Foreman Date: Fri, 28 Nov 2025 15:50:18 -0600 Subject: perf data: Fix coding style Adjust some oddly indented fprintf() calls. Signed-off-by: Derek Foreman Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-bt.c | 6 ++---- tools/perf/util/data-convert-json.c | 9 ++++----- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 0bcbc0e309e0..a22e9049ff30 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1693,12 +1693,10 @@ int bt_convert__perf2ctf(const char *input, const char *path, else pr_err("Error during conversion.\n"); - fprintf(stderr, - "[ perf data convert: Converted '%s' into CTF data '%s' ]\n", + fprintf(stderr, "[ perf data convert: Converted '%s' into CTF data '%s' ]\n", data.path, path); - fprintf(stderr, - "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples", + fprintf(stderr, "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples", (double) c.events_size / 1024.0 / 1024.0, c.events_count); diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 787039967916..eefa3a94c813 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -427,13 +427,12 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, output_json_format(c.out, false, 0, "}"); fputc('\n', c.out); - fprintf(stderr, - "[ perf data convert: Converted '%s' into JSON data '%s' ]\n", - data.path, output_name); + fprintf(stderr, "[ perf data convert: Converted '%s' into JSON data '%s' ]\n", + data.path, output_name); fprintf(stderr, - "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n", - (ftell(c.out)) / 1024.0 / 1024.0, c.events_count); + "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n", + (ftell(c.out)) / 1024.0 / 1024.0, c.events_count); if (c.skipped) { fprintf(stderr, "[ perf data convert: Skipped %" PRIu64 " samples ]\n", -- cgit v1.2.3 From 1eb217ab2e737609f8a861b517649e82e7236d05 Mon Sep 17 00:00:00 2001 From: Faisal Bukhari Date: Mon, 22 Sep 2025 23:38:34 +0530 Subject: perf parse-events: Fix evsel allocation failure If evsel__new_idx() returns NULL, the function currently jumps to label 'out_err'. Here, references to `cpus` and `pmu_cpus` are dropped. Also, resources held by evsel->name and evsel->metric_id are freed. But if evsel__new_idx() returns NULL, it can lead to NULL pointer dereference. Fixes: cd63c22168257a0b ("perf parse-events: Minor __add_event refactoring") Signed-off-by: Faisal Bukhari Reviewed-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim --- tools/perf/util/parse-events.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 17c1c36a7bf9..000c89a1e50d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -251,8 +251,11 @@ __add_event(struct list_head *list, int *idx, event_attr_init(attr); evsel = evsel__new_idx(attr, *idx); - if (!evsel) - goto out_err; + if (!evsel) { + perf_cpu_map__put(cpus); + perf_cpu_map__put(pmu_cpus); + return NULL; + } if (name) { evsel->name = strdup(name); -- cgit v1.2.3 From cb3de96eea66f5e4a580086c6a1be46e765f97f4 Mon Sep 17 00:00:00 2001 From: Yumei Huang Date: Sun, 4 Jan 2026 11:23:57 +0800 Subject: ipv6: preserve insertion order for same-scope addresses IPv6 addresses with the same scope are returned in reverse insertion order, unlike IPv4. For example, when adding a -> b -> c, the list is reported as c -> b -> a, while IPv4 preserves the original order. This behavior causes: a. When using `ip -6 a save` and `ip -6 a restore`, addresses are restored in the opposite order from which they were saved. See example below showing addresses added as 1::1, 1::2, 1::3 but displayed and saved in reverse order. # ip -6 a a 1::1 dev x # ip -6 a a 1::2 dev x # ip -6 a a 1::3 dev x # ip -6 a s dev x 2: x: mtu 1500 qdisc noop state DOWN group default qlen 1000 inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever # ip -6 a save > dump # ip -6 a d 1::1 dev x # ip -6 a d 1::2 dev x # ip -6 a d 1::3 dev x # ip a d ::1 dev lo # ip a restore < dump # ip -6 a s dev x 2: x: mtu 1500 qdisc noop state DOWN group default qlen 1000 inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever # ip a showdump < dump if1: inet6 ::1/128 scope host proto kernel_lo valid_lft forever preferred_lft forever if2: inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever if2: inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever if2: inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever b. Addresses in pasta to appear in reversed order compared to host addresses. The ipv6 addresses were added in reverse order by commit e55ffac60117 ("[IPV6]: order addresses by scope"), then it was changed by commit 502a2ffd7376 ("ipv6: convert idev_list to list macros"), and restored by commit b54c9b98bbfb ("ipv6: Preserve pervious behavior in ipv6_link_dev_addr()."). However, this reverse ordering within the same scope causes inconsistency with IPv4 and the issues described above. This patch aligns IPv6 address ordering with IPv4 for consistency by changing the comparison from >= to > when inserting addresses into the address list. Also updates the ioam6 selftest to reflect the new address ordering behavior. Combine these two changes into one patch for bisectability. Link: https://bugs.passt.top/show_bug.cgi?id=175 Suggested-by: Stefano Brivio Signed-off-by: Yumei Huang Acked-by: Justin Iurman Reviewed-by: David Ahern Link: https://patch.msgid.link/20260104032357.38555-1-yuhuang@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- tools/testing/selftests/net/ioam6.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b66217d1b2f8..fe64988a23ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) break; } diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index 845c26dd01a9..b2b99889942f 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -273,8 +273,8 @@ setup() ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null - ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null + ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha link set veth0 up &>/dev/null ip -netns $ioam_node_alpha link set lo up &>/dev/null ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \ -- cgit v1.2.3 From c4df070a57def243dcf5773428dd1b51bc8337ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 4 Jan 2026 10:46:00 -0800 Subject: selftests: hw-net: rss-input-xfrm: try to enable the xfrm at the start The test currently SKIPs if the symmetric RSS xfrm is not enabled by default. This leads to spurious SKIPs in the Intel CI reporting results to NIPA. Testing on CX7: # ./drivers/net/hw/rss_input_xfrm.py TAP version 13 1..2 ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity # Sym input xfrm already enabled: {'sym-or-xor'} ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6 # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0 # ethtool -X eth0 xfrm none # ./drivers/net/hw/rss_input_xfrm.py TAP version 13 1..2 ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity # Sym input xfrm configured: {'sym-or-xor'} ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6 # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0 Link: https://patch.msgid.link/20260104184600.795280-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/rss_input_xfrm.py | 44 +++++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index 72880e388478..503f1a2a2872 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -5,9 +5,9 @@ import multiprocessing import socket from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout from lib.py import NetDrvEpEnv -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import KsftSkipEx, KsftFailEx -from lib.py import rand_port +from lib.py import defer, ksft_pr, rand_port def traffic(cfg, local_port, remote_port, ipver): @@ -21,6 +21,40 @@ def traffic(cfg, local_port, remote_port, ipver): return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) +def _rss_input_xfrm_try_enable(cfg): + """ + Check if symmetric input-xfrm is already enabled, if not try to enable it + and register a cleanup. + """ + rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) + orig_xfrm = rss.get('input-xfrm', set()) + sym_xfrm = set(filter(lambda x: 'sym' in x, orig_xfrm)) + + if sym_xfrm: + ksft_pr("Sym input xfrm already enabled:", sym_xfrm) + return sym_xfrm + + for xfrm in cfg.ethnl.consts["input-xfrm"].entries: + # Skip non-symmetric transforms + if "sym" not in xfrm: + continue + + try_xfrm = {xfrm} | orig_xfrm + try: + cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex}, + "input-xfrm": try_xfrm}) + except NlError: + continue + + ksft_pr("Sym input xfrm configured:", try_xfrm) + defer(cfg.ethnl.rss_set, + {"header": {"dev-index": cfg.ifindex}, + "input-xfrm": orig_xfrm}) + return {xfrm} + + return set() + + def test_rss_input_xfrm(cfg, ipver): """ Test symmetric input_xfrm. @@ -37,12 +71,10 @@ def test_rss_input_xfrm(cfg, ipver): if not hasattr(socket, "SO_INCOMING_CPU"): raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") - rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) - input_xfrm = set(filter(lambda x: 'sym' in x, rss.get('input-xfrm', {}))) - # Check for symmetric xor/or-xor + input_xfrm = _rss_input_xfrm_try_enable(cfg) if not input_xfrm: - raise KsftSkipEx("Symmetric RSS hash not requested") + raise KsftSkipEx("Symmetric RSS hash not supported by device") cpus = set() successful = 0 -- cgit v1.2.3 From 3b7a108c4197f9fd0b593c6b4b0de457d9ed4c87 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 5 Jan 2026 12:25:02 -0500 Subject: selftests/net: packetdrill: add minimal client and server tests Introduce minimal tests. These can serve as simple illustrative examples, and as templates when writing new tests. When adding new cases, it can be easier to extend an existing base test rather than start from scratch. The existing tests all focus on real, often non-trivial, features. It is not obvious which to take as starting point, and arguably none really qualify. Add two tests - the client test performs the active open and initial close - the server test implements the passive open and final close Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20260105172529.3514786-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../selftests/net/packetdrill/tcp_basic_client.pkt | 24 +++++++++++++++ .../selftests/net/packetdrill/tcp_basic_server.pkt | 35 ++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt new file mode 100644 index 000000000000..319f81dd717d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal active open. +// First to close connection. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + + // Connect to server: active open: three-way handshake + +0...0 connect(4, ..., ...) = 0 + +0 > S 0:0(0) + +0 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + + // Send data + +0 send(4, ..., 1000, 0) = 1000 + +0 > P. 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1001 win 257 + + +0 close(4) = 0 + +0 > F. 1001:1001(0) ack 1 + +0 < F. 1:1(0) ack 1002 win 257 + +0 > . 1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt new file mode 100644 index 000000000000..e72a291b666e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal passive open. +// Peer is first to close. + +`./defaults.sh` + + // Open listener socket + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + // Incoming connection: passive open: three-way handshake + +0 < S 0:0(0) win 65535 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 257 + + // Open connection socket and close listener socket + +0 accept(3, ..., ...) = 4 + +0 close(3) = 0 + + // Peer sends data: acknowledge and receive + +0 < P. 1:1001(1000) ack 1 win 257 + +0 > . 1:1(0) ack 1001 + +0 recv(4, ..., 1000, 0) = 1000 + + // Peer initiates connection close + +0 < F. 1001:1001(0) ack 1 win 257 + +.04 > . 1:1(0) ack 1002 + + // Local socket also closes its side + +0 close(4) = 0 + +0 > F. 1:1(0) ack 1002 + +0 < . 1002:1002(0) ack 2 win 257 -- cgit v1.2.3 From b81d5e9d965e0af2c1f21fc392a23e598171f9d6 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:45 -0500 Subject: selftests/bpf: add tests for arena kfuncs under lock Add selftests to ensure the verifier permits calling the arena kfunc API while holding a lock. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-3-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena.c | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 4a9d96344813..c4b8daac4388 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -10,6 +10,8 @@ #include "bpf_experimental.h" #include "bpf_arena_common.h" +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); @@ -439,4 +441,40 @@ int iter_maps3(struct bpf_iter__bpf_map *ctx) return 0; } +private(ARENA_TESTS) struct bpf_spin_lock arena_bpf_test_lock; + +/* Use the arena kfunc API while under a BPF lock. */ +SEC("syscall") +__success __retval(0) +int arena_kfuncs_under_bpf_lock(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + bpf_spin_lock(&arena_bpf_test_lock); + + /* Get a separate region of the arena. */ + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 1; + } + + bpf_arena_free_pages(&arena, page, 1); + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 2; + } + + bpf_arena_free_pages(&arena, page, 1); + + bpf_spin_unlock(&arena_bpf_test_lock); +#endif + + return 0; +} char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 31 +++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) -{ - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) +{ + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From 2546863b4a723c96f55af7127827d62632cfbc9c Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:21 +0800 Subject: libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps Add libbpf support for the BPF_F_CPU flag for percpu maps by embedding the cpu info into the high 32 bits of: 1. **flags**: bpf_map_lookup_elem_flags(), bpf_map__lookup_elem(), bpf_map_update_elem() and bpf_map__update_elem() 2. **opts->elem_flags**: bpf_map_lookup_batch() and bpf_map_update_batch() And the flag can be BPF_F_ALL_CPUS, but cannot be 'BPF_F_CPU | BPF_F_ALL_CPUS'. Behavior: * If the flag is BPF_F_ALL_CPUS, the update is applied across all CPUs. * If the flag is BPF_F_CPU, it updates value only to the specified CPU. * If the flag is BPF_F_CPU, lookup value only from the specified CPU. * lookup does not support BPF_F_ALL_CPUS. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-7-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.h | 8 ++++++++ tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++------ tools/lib/bpf/libbpf.h | 21 ++++++++------------- 3 files changed, 36 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 1f9c28d27795..2c8e88ddb674 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -289,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, * Update spin_lock-ed map elements. This must be * specified if the map value contains a spinlock. * + * **BPF_F_CPU** + * As for percpu maps, update value on the specified CPU. And the cpu + * info is embedded into the high 32 bits of **opts->elem_flags**. + * + * **BPF_F_ALL_CPUS** + * As for percpu maps, update value across all CPUs. This flag cannot + * be used with BPF_F_CPU at the same time. + * * @param fd BPF map file descriptor * @param keys pointer to an array of *count* keys * @param values pointer to an array of *count* values diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1a52d818a76c..6ea81701e274 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10919,7 +10919,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) } static int validate_map_op(const struct bpf_map *map, size_t key_sz, - size_t value_sz, bool check_value_sz) + size_t value_sz, bool check_value_sz, __u64 flags) { if (!map_is_created(map)) /* map is not yet created */ return -ENOENT; @@ -10946,6 +10946,20 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, int num_cpu = libbpf_num_possible_cpus(); size_t elem_sz = roundup(map->def.value_size, 8); + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) { + pr_warn("map '%s': BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive\n", + map->name); + return -EINVAL; + } + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided for either BPF_F_CPU or BPF_F_ALL_CPUS, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + if (value_sz != num_cpu * elem_sz) { pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); @@ -10970,7 +10984,7 @@ int bpf_map__lookup_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10983,7 +10997,7 @@ int bpf_map__update_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10995,7 +11009,7 @@ int bpf_map__delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, flags); if (err) return libbpf_err(err); @@ -11008,7 +11022,7 @@ int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -11020,7 +11034,7 @@ int bpf_map__get_next_key(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, 0); if (err) return libbpf_err(err); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e14d9e349f9c..dfc37a615578 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1216,12 +1216,13 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory in which looked up value will be stored * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * definition's **value_size**. For per-CPU BPF maps, value size can be + * `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified + * in **flags**, otherwise a product of BPF map value size and number of + * possible CPUs in the system (could be fetched with + * **libbpf_num_possible_cpus()**). Note also that for per-CPU values value + * size has to be aligned up to closest 8 bytes, so expected size is: + * `round_up(value_size, 8) * libbpf_num_possible_cpus()`. * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * @@ -1239,13 +1240,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, * @param key pointer to memory containing bytes of the key * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory containing bytes of the value - * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * @param value_sz refer to **bpf_map__lookup_elem**'s description.' * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * -- cgit v1.2.3 From 07bf7aa58e5e7fb27b8addcc33052400a7d9ce32 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:22 +0800 Subject: selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags Add test coverage for the new BPF_F_CPU and BPF_F_ALL_CPUS flags support in percpu maps. The following APIs are exercised: * bpf_map_update_batch() * bpf_map_lookup_batch() * bpf_map_update_elem() * bpf_map__update_elem() * bpf_map_lookup_elem_flags() * bpf_map__lookup_elem() For lru_percpu_hash map, set max_entries to 'libbpf_num_possible_cpus() + 1' and only use the first 'libbpf_num_possible_cpus()' entries. This ensures a spare entry is always available in the LRU free list, avoiding eviction. When updating an existing key in lru_percpu_hash map: 1. l_new = prealloc_lru_pop(); /* Borrow from free list */ 2. l_old = lookup_elem_raw(); /* Found, key exists */ 3. pcpu_copy_value(); /* In-place update */ 4. bpf_lru_push_free(); /* Return l_new to free list */ Also add negative tests to verify that non-percpu array and hash maps reject the BPF_F_CPU and BPF_F_ALL_CPUS flags. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-8-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/percpu_alloc.c | 328 +++++++++++++++++++++ .../selftests/bpf/progs/percpu_alloc_array.c | 32 ++ 2 files changed, 360 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index 343da65864d6..c1d0949f093f 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "cgroup_helpers.h" #include "percpu_alloc_array.skel.h" #include "percpu_alloc_cgrp_local_storage.skel.h" #include "percpu_alloc_fail.skel.h" @@ -115,6 +116,321 @@ static void test_failure(void) { RUN_TESTS(percpu_alloc_fail); } +static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t key_sz, u32 entries, + int nr_cpus, bool test_batch) +{ + size_t value_sz = sizeof(u32), value_sz_cpus, value_sz_total; + u32 *values = NULL, *values_percpu = NULL; + const u32 value = 0xDEADC0DE; + int i, j, cpu, map_fd, err; + u64 batch = 0, flags; + void *values_row; + u32 count, v; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + value_sz_cpus = value_sz * nr_cpus; + values = calloc(entries, value_sz_cpus); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + values_percpu = calloc(entries, roundup(value_sz, 8) * nr_cpus); + if (!ASSERT_OK_PTR(values_percpu, "calloc values_percpu")) { + free(values); + return; + } + + value_sz_total = value_sz_cpus * entries; + memset(values, 0, value_sz_total); + + map_fd = bpf_map__fd(map); + flags = BPF_F_CPU | BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu|all_cpus")) + goto out; + + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem cpu|all_cpus")) + goto out; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags all_cpus")) + goto out; + + flags = BPF_F_LOCK | BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags BPF_F_LOCK")) + goto out; + + flags = BPF_F_LOCK | BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem BPF_F_LOCK")) + goto out; + + flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_elem -ERANGE")) + goto out; + + err = bpf_map__update_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__update_elem -ERANGE")) + goto out; + + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_lookup_elem_flags -ERANGE")) + goto out; + + err = bpf_map__lookup_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__lookup_elem -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + /* clear value on all cpus */ + values[0] = 0; + flags = BPF_F_ALL_CPUS; + for (i = 0; i < entries; i++) { + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem all_cpus")) + goto out; + } + + /* update value on specified cpu */ + for (i = 0; i < entries; i++) { + values[0] = value; + flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem specified cpu")) + goto out; + + /* lookup then check value on CPUs */ + for (j = 0; j < nr_cpus; j++) { + flags = (u64)j << 32 | BPF_F_CPU; + err = bpf_map__lookup_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__lookup_elem specified cpu")) + goto out; + if (!ASSERT_EQ(values[0], j != cpu ? 0 : value, + "bpf_map__lookup_elem value on specified cpu")) + goto out; + } + } + } + + if (!test_batch) + goto out; + + count = entries; + batch_opts.elem_flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_batch -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + memset(values, 0, value_sz_total); + + /* clear values across all CPUs */ + count = entries; + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) + goto out; + + /* update values on specified CPU */ + for (i = 0; i < entries; i++) + values[i] = value; + + count = entries; + batch_opts.elem_flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) + goto out; + + /* lookup values on specified CPU */ + batch = 0; + count = entries; + memset(values, 0, entries * value_sz); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) + goto out; + + for (i = 0; i < entries; i++) + if (!ASSERT_EQ(values[i], value, + "bpf_map_lookup_batch value on specified cpu")) + goto out; + + /* lookup values from all CPUs */ + batch = 0; + count = entries; + batch_opts.elem_flags = 0; + memset(values_percpu, 0, roundup(value_sz, 8) * nr_cpus * entries); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values_percpu, &count, + &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) + goto out; + + for (i = 0; i < entries; i++) { + values_row = (void *) values_percpu + + roundup(value_sz, 8) * i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + v = *(u32 *) (values_row + roundup(value_sz, 8) * j); + if (!ASSERT_EQ(v, j != cpu ? 0 : value, + "bpf_map_lookup_batch value all_cpus")) + goto out; + } + } + } + +out: + free(values_percpu); + free(values); +} + + +static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) +{ + struct percpu_alloc_array *skel; + size_t key_sz = sizeof(int); + int *keys, nr_cpus, i, err; + struct bpf_map *map; + u32 max_entries; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + max_entries = nr_cpus + 1; + keys = calloc(max_entries, key_sz); + if (!ASSERT_OK_PTR(keys, "calloc keys")) + return; + + for (i = 0; i < max_entries; i++) + keys[i] = i; + + skel = percpu_alloc_array__open(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open")) { + free(keys); + return; + } + + map = skel->maps.percpu; + bpf_map__set_type(map, map_type); + bpf_map__set_max_entries(map, max_entries); + + err = percpu_alloc_array__load(skel); + if (!ASSERT_OK(err, "test_percpu_alloc__load")) + goto out; + + test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); +out: + percpu_alloc_array__destroy(skel); + free(keys); +} + +static void test_percpu_array_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_ARRAY); +} + +static void test_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_HASH); +} + +static void test_lru_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_LRU_PERCPU_HASH); +} + +static void test_percpu_cgroup_storage_cpu_flag(void) +{ + struct percpu_alloc_array *skel = NULL; + struct bpf_cgroup_storage_key key; + int cgroup, prog_fd, nr_cpus, err; + struct bpf_map *map; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return; + + cgroup = create_and_get_cgroup("/cg_percpu"); + if (!ASSERT_GE(cgroup, 0, "create_and_get_cgroup")) { + cleanup_cgroup_environment(); + return; + } + + err = join_cgroup("/cg_percpu"); + if (!ASSERT_OK(err, "join_cgroup")) + goto out; + + skel = percpu_alloc_array__open_and_load(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.cgroup_egress); + err = bpf_prog_attach(prog_fd, cgroup, BPF_CGROUP_INET_EGRESS, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + map = skel->maps.percpu_cgroup_storage; + err = bpf_map_get_next_key(bpf_map__fd(map), NULL, &key); + if (!ASSERT_OK(err, "bpf_map_get_next_key")) + goto out; + + test_percpu_map_op_cpu_flag(map, &key, sizeof(key), 1, nr_cpus, false); +out: + bpf_prog_detach2(-1, cgroup, BPF_CGROUP_INET_EGRESS); + close(cgroup); + cleanup_cgroup_environment(); + percpu_alloc_array__destroy(skel); +} + +static void test_map_op_cpu_flag(enum bpf_map_type map_type) +{ + u32 max_entries = 1, count = max_entries; + u64 flags, batch = 0, val = 0; + int err, map_fd, key = 0; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + map_fd = bpf_map_create(map_type, "test_cpu_flag", sizeof(int), sizeof(u64), max_entries, + NULL); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_update_elem all_cpus"); + + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_update_batch all_cpus"); + + flags = BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu"); + + batch_opts.elem_flags = BPF_F_CPU; + err = bpf_map_lookup_batch(map_fd, NULL, &batch, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_lookup_batch cpu"); + + close(map_fd); +} + +static void test_array_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_ARRAY); +} + +static void test_hash_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_HASH); +} + void test_percpu_alloc(void) { if (test__start_subtest("array")) @@ -125,4 +441,16 @@ void test_percpu_alloc(void) test_cgrp_local_storage(); if (test__start_subtest("failure_tests")) test_failure(); + if (test__start_subtest("cpu_flag_percpu_array")) + test_percpu_array_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_hash")) + test_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_lru_percpu_hash")) + test_lru_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_cgroup_storage")) + test_percpu_cgroup_storage_cpu_flag(); + if (test__start_subtest("cpu_flag_array")) + test_array_cpu_flag(); + if (test__start_subtest("cpu_flag_hash")) + test_hash_cpu_flag(); } diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c index 37c2d2608ec0..ed6a2a93d5a5 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c @@ -187,4 +187,36 @@ out: return 0; } +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, u32); +} percpu SEC(".maps"); + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(test_percpu_array, int x) +{ + u64 value = 0xDEADC0DE; + int key = 0; + + bpf_map_update_elem(&percpu, &key, &value, BPF_ANY); + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, u32); +} percpu_cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int cgroup_egress(struct __sk_buff *skb) +{ + u32 *val = bpf_get_local_storage(&percpu_cgroup_storage, 0); + + *val = 1; + return 1; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 97fb54d86d2194ea8a4cbe6cf074e6ba47b054ea Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:49 +0100 Subject: bpf: adapt selftests to GCC 16 -Wunused-but-set-variable GCC 16 has changed the semantics of -Wunused-but-set-variable, as well as introducing new options -Wunused-but-set-variable={0,1,2,3} to adjust the level of support. One of the changes is that GCC now treats 'sum += 1' and 'sum++' as non-usage, whereas clang (and GCC < 16) considers the first as usage and the second as non-usage, which is sort of inconsistent. The GCC 16 -Wunused-but-set-variable=2 option implements the previous semantics of -Wunused-but-set-variable, but since it is a new option, it cannot be used unconditionally for forward-compatibility, just for backwards-compatibility. So this patch adds pragmas to the two self-tests impacted by this, progs/free_timer.c and progs/rcu_read_lock.c, to make gcc to ignore -Wunused-but-set-variable warnings when compiling them with GCC > 15. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44677#c25 for details on why this regression got introduced in GCC upstream. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-2-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/free_timer.c | 10 ++++++++++ tools/testing/selftests/bpf/progs/rcu_read_lock.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/free_timer.c b/tools/testing/selftests/bpf/progs/free_timer.c index 4501ae8fc414..eccb2d47db43 100644 --- a/tools/testing/selftests/bpf/progs/free_timer.c +++ b/tools/testing/selftests/bpf/progs/free_timer.c @@ -7,6 +7,16 @@ #define MAX_ENTRIES 8 +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + struct map_value { struct bpf_timer timer; }; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index d70c28824bbe..b4e073168fb1 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -7,6 +7,16 @@ #include "bpf_tracing_net.h" #include "bpf_misc.h" +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + char _license[] SEC("license") = "GPL"; struct { -- cgit v1.2.3 From 681600647c59050546939da5e490c736e567fe91 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:50 +0100 Subject: bpf: GCC requires function attributes before the declarator GCC insists in placing attributes before the declarators in function declarations. Now that GCC supports btf_decl_tag and therefore __tag1 and __tag2 expand to actual attributes, the compiler is complaining about it for static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 progs/test_btf_decl_tag.c:36:1: error: attributes should be specified \ before the declarator in a function definition This patch simply places the tags before the declarator. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-3-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_btf_decl_tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c index c88ccc53529a..0c3df19626cb 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c +++ b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c @@ -33,7 +33,7 @@ struct { } hashmap1 SEC(".maps"); -static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 +static __noinline __tag1 __tag2 int foo(int x __tag1 __tag2) { struct key_t key; value_t val = {}; -- cgit v1.2.3 From c219d4ee1d63b772d5fa8ed453b9cec18a9e2f6a Mon Sep 17 00:00:00 2001 From: Crystal Wood Date: Wed, 12 Nov 2025 09:25:29 -0600 Subject: rtla: Set stop threshold after all instances are enabled This avoids startup races where one of the instances hit a threshold before all instances were enabled, and thus tracing stops without the relevant event. In particular, this is not uncommon with the tests that set a very tight threshold and then complain if there's no analysis. This also ensures that we don't stop tracing during a warmup. The downside is a small chance of having an event over the threshold early in the output, without stopping on it, which could cause user confusion. This should be less likely if the warmup feature is used, but that doesn't eliminate the race window, just the odds of an unusual spike right at that moment. Signed-off-by: Crystal Wood Link: https://lore.kernel.org/r/20251112152529.956778-6-crwood@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 20 ++++++++++++++++++++ tools/tracing/rtla/src/common.h | 4 ++++ tools/tracing/rtla/src/osnoise.c | 17 ++++------------- tools/tracing/rtla/src/osnoise.h | 5 ----- tools/tracing/rtla/src/timerlat.c | 29 ++++++++++------------------- 5 files changed, 38 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index b197037fc58b..46e0263d6ae8 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -348,3 +348,23 @@ int hist_main_loop(struct osnoise_tool *tool) return retval; } + +int osn_set_stop(struct osnoise_tool *tool) +{ + struct common_params *params = tool->params; + int retval; + + retval = osnoise_set_stop_us(tool->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + return retval; + } + + retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + return retval; + } + + return 0; +} diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index 9ec2b7632c37..c5e73d4600a0 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -152,7 +152,11 @@ void osnoise_destroy_tool(struct osnoise_tool *top); struct osnoise_tool *osnoise_init_tool(char *tool_name); struct osnoise_tool *osnoise_init_trace_tool(const char *tracer); bool osnoise_trace_is_off(struct osnoise_tool *tool, struct osnoise_tool *record); +int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us); +int osnoise_set_stop_total_us(struct osnoise_context *context, + long long stop_total_us); int common_apply_config(struct osnoise_tool *tool, struct common_params *params); int top_main_loop(struct osnoise_tool *tool); int hist_main_loop(struct osnoise_tool *tool); +int osn_set_stop(struct osnoise_tool *tool); diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index 312c511fa004..945eb61efc46 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -1128,18 +1128,6 @@ osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params) goto out_err; } - retval = osnoise_set_stop_us(tool->context, params->common.stop_us); - if (retval) { - err_msg("Failed to set stop us\n"); - goto out_err; - } - - retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us); - if (retval) { - err_msg("Failed to set stop total us\n"); - goto out_err; - } - retval = osnoise_set_tracing_thresh(tool->context, params->threshold); if (retval) { err_msg("Failed to set tracing_thresh\n"); @@ -1184,9 +1172,12 @@ int osnoise_enable(struct osnoise_tool *tool) debug_msg("Error cleaning up the buffer"); return retval; } - } + retval = osn_set_stop(tool); + if (retval) + return retval; + return 0; } diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h index 75de0d5c706a..168669aa7e0d 100644 --- a/tools/tracing/rtla/src/osnoise.h +++ b/tools/tracing/rtla/src/osnoise.h @@ -34,12 +34,7 @@ int osnoise_set_runtime_period(struct osnoise_context *context, unsigned long long period); void osnoise_restore_runtime_period(struct osnoise_context *context); -int osnoise_set_stop_us(struct osnoise_context *context, - long long stop_us); void osnoise_restore_stop_us(struct osnoise_context *context); - -int osnoise_set_stop_total_us(struct osnoise_context *context, - long long stop_total_us); void osnoise_restore_stop_total_us(struct osnoise_context *context); int osnoise_set_timerlat_period_us(struct osnoise_context *context, diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index df4f9bfe3433..ee15e344cf37 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -48,25 +48,6 @@ timerlat_apply_config(struct osnoise_tool *tool, struct timerlat_params *params) } } - if (params->mode != TRACING_MODE_BPF) { - /* - * In tracefs and mixed mode, timerlat tracer handles stopping - * on threshold - */ - retval = osnoise_set_stop_us(tool->context, params->common.stop_us); - if (retval) { - err_msg("Failed to set stop us\n"); - goto out_err; - } - - retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us); - if (retval) { - err_msg("Failed to set stop total us\n"); - goto out_err; - } - } - - retval = osnoise_set_timerlat_period_us(tool->context, params->timerlat_period_us ? params->timerlat_period_us : @@ -184,6 +165,16 @@ int timerlat_enable(struct osnoise_tool *tool) } } + /* + * In tracefs and mixed mode, timerlat tracer handles stopping + * on threshold + */ + if (params->mode != TRACING_MODE_BPF) { + retval = osn_set_stop(tool); + if (retval) + return retval; + } + return 0; } -- cgit v1.2.3 From a08e012e814d346c191726a877b18901c3bc204f Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Mon, 24 Nov 2025 08:31:46 +0200 Subject: tools/rtla: Add common_usage() The rtla tools have significant code quadruplication in their usage functions. Each tool implements its own version of the same help text formatting and option descriptions, leading to maintenance overhead and inconsistencies. Documentation/tools/rtla/common_options.rst lists 14 common options. Add common_usage() infrastructure to consolidate help formatting. Subsequent patches will extend this to handle other common options. The refactored output is almost identical to the original, with the following changes: - add square brackets to specify optionality: `usage: [rtla] ...` - remove `-q` from timerlat hist because hist tools don't support it - minor spacing Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251124063204.845425-1-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 39 ++++++++++++++++++++++++++++++++++ tools/tracing/rtla/src/common.h | 3 +++ tools/tracing/rtla/src/osnoise_hist.c | 22 +++++++------------ tools/tracing/rtla/src/osnoise_top.c | 37 +++++++++++++------------------- tools/tracing/rtla/src/timerlat_hist.c | 22 +++++++------------ tools/tracing/rtla/src/timerlat_top.c | 22 +++++++------------ 6 files changed, 81 insertions(+), 64 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 46e0263d6ae8..009a4bce9737 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "common.h" @@ -368,3 +369,41 @@ int osn_set_stop(struct osnoise_tool *tool) return 0; } + +static void print_msg_array(const char * const *msgs) +{ + if (!msgs) + return; + + for (int i = 0; msgs[i]; i++) + fprintf(stderr, "%s\n", msgs[i]); +} + +/* + * common_usage - print complete usage information + */ +void common_usage(const char *tool, const char *mode, + const char *desc, const char * const *start_msgs, const char * const *opt_msgs) +{ + static const char * const common_options[] = { + " -h/--help: print this menu", + NULL + }; + fprintf(stderr, "rtla %s", tool); + if (strcmp(mode, "")) + fprintf(stderr, " %s", mode); + fprintf(stderr, ": %s (version %s)\n\n", desc, VERSION); + fprintf(stderr, " usage: [rtla] %s ", tool); + + if (strcmp(mode, "top") == 0) + fprintf(stderr, "[top] [-h] "); + else + fprintf(stderr, "%s [-h] ", mode); + + print_msg_array(start_msgs); + fprintf(stderr, "\n"); + print_msg_array(common_options); + print_msg_array(opt_msgs); + + exit(EXIT_SUCCESS); +} diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index c5e73d4600a0..c48c9bfd20e3 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -160,3 +160,6 @@ int common_apply_config(struct osnoise_tool *tool, struct common_params *params) int top_main_loop(struct osnoise_tool *tool); int hist_main_loop(struct osnoise_tool *tool); int osn_set_stop(struct osnoise_tool *tool); + +void common_usage(const char *tool, const char *mode, + const char *desc, const char * const *start_msgs, const char * const *opt_msgs); diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index ff8c231e47c4..372128db9e4a 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -409,16 +409,15 @@ osnoise_print_stats(struct osnoise_tool *tool) */ static void osnoise_hist_usage(void) { - int i; - - static const char * const msg[] = { - "", - " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", + static const char * const msg_start[] = { + "[-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t [file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", " -p/--period us: osnoise period in us", " -r/--runtime us: osnoise runtime in us", @@ -453,13 +452,8 @@ static void osnoise_hist_usage(void) NULL, }; - fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("osnoise", "hist", "a per-cpu histogram of the OS noise", + msg_start, msg_opts); } /* diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 04c699bdd736..1db1d946b600 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -257,14 +257,16 @@ osnoise_print_stats(struct osnoise_tool *top) */ static void osnoise_top_usage(struct osnoise_params *params) { - int i; + const char *tool, *mode, *desc; - static const char * const msg[] = { - " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", + static const char * const msg_start[] = { + "[-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t [file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", " -p/--period us: osnoise period in us", " -r/--runtime us: osnoise runtime in us", @@ -295,25 +297,16 @@ static void osnoise_top_usage(struct osnoise_params *params) }; if (params->mode == MODE_OSNOISE) { - fprintf(stderr, - "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n", - VERSION); - - fprintf(stderr, " usage: rtla osnoise [top]"); + tool = "osnoise"; + mode = "top"; + desc = "a per-cpu summary of the OS noise"; + } else { + tool = "hwnoise"; + mode = ""; + desc = "a summary of hardware-related noise"; } - if (params->mode == MODE_HWNOISE) { - fprintf(stderr, - "rtla hwnoise: a summary of hardware-related noise (version %s)\n", - VERSION); - - fprintf(stderr, " usage: rtla hwnoise"); - } - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage(tool, mode, desc, msg_start, msg_opts); } /* diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 1fb471a787b7..2a5c543217ba 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -696,17 +696,16 @@ timerlat_print_stats(struct osnoise_tool *tool) */ static void timerlat_hist_usage(void) { - int i; - - char *msg[] = { - "", - " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", + static const char * const msg_start[] = { + "[-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", " [-t [file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", " [--warm-up s] [--deepest-idle-state n]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", " -p/--period us: timerlat period in us", " -i/--irq us: stop trace if the irq latency is higher than the argument in us", @@ -750,13 +749,8 @@ static void timerlat_hist_usage(void) NULL, }; - fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("timerlat", "hist", "a per-cpu histogram of the timer latency", + msg_start, msg_opts); } /* diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 29c2c1f717ed..9ed8b931552f 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -476,15 +476,14 @@ timerlat_print_stats(struct osnoise_tool *top) */ static void timerlat_top_usage(void) { - int i; - - static const char *const msg[] = { - "", - " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", + static const char *const msg_start[] = { + "[-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", " [[-t [file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char *const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", " --aa-only us: stop if latency is hit, only printing the auto analysis (reduces CPU usage)", " -p/--period us: timerlat period in us", @@ -522,13 +521,8 @@ static void timerlat_top_usage(void) NULL, }; - fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("timerlat", "top", "a per-cpu summary of the timer latency", + msg_start, msg_opts); } /* -- cgit v1.2.3 From 8cd0f08ac72e25e2a048c72d76730676ab0106f3 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 26 Nov 2025 15:41:59 +0100 Subject: rtla/timerlat: Support tail call from BPF program Add a map to the rtla-timerlat BPF program that holds a file descriptor of another BPF program, to be executed on threshold overflow. timerlat_bpf_set_action() is added as an interface to set the program. Link: https://lore.kernel.org/r/20251126144205.331954-2-tglozar@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/timerlat.bpf.c | 25 +++++++++++++++++++++---- tools/tracing/rtla/src/timerlat_bpf.c | 13 +++++++++++++ tools/tracing/rtla/src/timerlat_bpf.h | 1 + 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat.bpf.c b/tools/tracing/rtla/src/timerlat.bpf.c index e2265b5d6491..549d2d2191d2 100644 --- a/tools/tracing/rtla/src/timerlat.bpf.c +++ b/tools/tracing/rtla/src/timerlat.bpf.c @@ -40,6 +40,17 @@ struct { __uint(max_entries, 1); } signal_stop_tracing SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(unsigned int)); + __uint(max_entries, 1); + __array(values, unsigned int (void *)); +} bpf_action SEC(".maps") = { + .values = { + [0] = 0 + }, +}; + /* Params to be set by rtla */ const volatile int bucket_size = 1; const volatile int output_divisor = 1000; @@ -109,7 +120,7 @@ nosubprog void update_summary(void *map, map_set(map, SUMMARY_SUM, map_get(map, SUMMARY_SUM) + latency); } -nosubprog void set_stop_tracing(void) +nosubprog void set_stop_tracing(struct trace_event_raw_timerlat_sample *tp_args) { int value = 0; @@ -118,6 +129,12 @@ nosubprog void set_stop_tracing(void) /* Signal to userspace */ bpf_ringbuf_output(&signal_stop_tracing, &value, sizeof(value), 0); + + /* + * Call into BPF action program, if attached. + * Otherwise, just silently fail. + */ + bpf_tail_call(tp_args, &bpf_action, 0); } SEC("tp/osnoise/timerlat_sample") @@ -138,19 +155,19 @@ int handle_timerlat_sample(struct trace_event_raw_timerlat_sample *tp_args) update_summary(&summary_irq, latency, bucket); if (irq_threshold != 0 && latency_us >= irq_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } else if (tp_args->context == 1) { update_main_hist(&hist_thread, bucket); update_summary(&summary_thread, latency, bucket); if (thread_threshold != 0 && latency_us >= thread_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } else { update_main_hist(&hist_user, bucket); update_summary(&summary_user, latency, bucket); if (thread_threshold != 0 && latency_us >= thread_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } return 0; diff --git a/tools/tracing/rtla/src/timerlat_bpf.c b/tools/tracing/rtla/src/timerlat_bpf.c index e97d16646bcd..1d619e502c65 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.c +++ b/tools/tracing/rtla/src/timerlat_bpf.c @@ -59,6 +59,19 @@ int timerlat_bpf_init(struct timerlat_params *params) return 0; } +/* + * timerlat_bpf_set_action - set action on threshold executed on BPF side + */ +static int timerlat_bpf_set_action(struct bpf_program *prog) +{ + unsigned int key = 0, value = bpf_program__fd(prog); + + return bpf_map__update_elem(bpf->maps.bpf_action, + &key, sizeof(key), + &value, sizeof(value), + BPF_ANY); +} + /* * timerlat_bpf_attach - attach BPF program to collect timerlat data */ diff --git a/tools/tracing/rtla/src/timerlat_bpf.h b/tools/tracing/rtla/src/timerlat_bpf.h index 118487436d30..b5009092c7a3 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.h +++ b/tools/tracing/rtla/src/timerlat_bpf.h @@ -12,6 +12,7 @@ enum summary_field { }; #ifndef __bpf__ +#include #ifdef HAVE_BPF_SKEL int timerlat_bpf_init(struct timerlat_params *params); int timerlat_bpf_attach(void); -- cgit v1.2.3 From f967d1eca7d0bde7c896014577ea876096831c6e Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 26 Nov 2025 15:42:00 +0100 Subject: rtla/timerlat: Add --bpf-action option Add option --bpf-action that allows the user to attach an external BPF program that will be executed via BPF tail call on latency threshold overflow. Executing additional BPF code on latency threshold overflow allows doing low-latency and in-kernel troubleshooting of the cause of the overflow. The option takes an argument, which is a path to a BPF ELF file expected to contain a function named "action_handler" in a section named "tp/timerlat_action" (the section is necessary for libbpf to assign the correct BPF program type to it). Link: https://lore.kernel.org/r/20251126144205.331954-3-tglozar@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/timerlat.c | 11 +++++++ tools/tracing/rtla/src/timerlat.h | 2 +- tools/tracing/rtla/src/timerlat_bpf.c | 53 ++++++++++++++++++++++++++++++++++ tools/tracing/rtla/src/timerlat_bpf.h | 6 +++- tools/tracing/rtla/src/timerlat_hist.c | 5 ++++ tools/tracing/rtla/src/timerlat_top.c | 5 ++++ 6 files changed, 80 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index ee15e344cf37..8f6cf55f4a94 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -48,6 +48,17 @@ timerlat_apply_config(struct osnoise_tool *tool, struct timerlat_params *params) } } + /* Check if BPF action program is requested but BPF is not available */ + if (params->bpf_action_program) { + if (params->mode == TRACING_MODE_TRACEFS) { + err_msg("BPF actions are not supported in tracefs-only mode\n"); + goto out_err; + } + + if (timerlat_load_bpf_action_program(params->bpf_action_program)) + goto out_err; + } + retval = osnoise_set_timerlat_period_us(tool->context, params->timerlat_period_us ? params->timerlat_period_us : diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h index fd6065f48bb7..8dd5d134ce08 100644 --- a/tools/tracing/rtla/src/timerlat.h +++ b/tools/tracing/rtla/src/timerlat.h @@ -27,6 +27,7 @@ struct timerlat_params { int dump_tasks; int deepest_idle_state; enum timerlat_tracing_mode mode; + const char *bpf_action_program; }; #define to_timerlat_params(ptr) container_of(ptr, struct timerlat_params, common) @@ -36,4 +37,3 @@ int timerlat_main(int argc, char *argv[]); int timerlat_enable(struct osnoise_tool *tool); void timerlat_analyze(struct osnoise_tool *tool, bool stopped); void timerlat_free(struct osnoise_tool *tool); - diff --git a/tools/tracing/rtla/src/timerlat_bpf.c b/tools/tracing/rtla/src/timerlat_bpf.c index 1d619e502c65..05adf18303df 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.c +++ b/tools/tracing/rtla/src/timerlat_bpf.c @@ -7,6 +7,10 @@ static struct timerlat_bpf *bpf; +/* BPF object and program for action program */ +static struct bpf_object *obj; +static struct bpf_program *prog; + /* * timerlat_bpf_init - load and initialize BPF program to collect timerlat data */ @@ -96,6 +100,11 @@ void timerlat_bpf_detach(void) void timerlat_bpf_destroy(void) { timerlat_bpf__destroy(bpf); + bpf = NULL; + if (obj) + bpf_object__close(obj); + obj = NULL; + prog = NULL; } static int handle_rb_event(void *ctx, void *data, size_t data_sz) @@ -190,4 +199,48 @@ int timerlat_bpf_get_summary_value(enum summary_field key, bpf->maps.summary_user, key, value_irq, value_thread, value_user, cpus); } + +/* + * timerlat_load_bpf_action_program - load and register a BPF action program + */ +int timerlat_load_bpf_action_program(const char *program_path) +{ + int err; + + obj = bpf_object__open_file(program_path, NULL); + if (!obj) { + err_msg("Failed to open BPF action program: %s\n", program_path); + goto out_err; + } + + err = bpf_object__load(obj); + if (err) { + err_msg("Failed to load BPF action program: %s\n", program_path); + goto out_obj_err; + } + + prog = bpf_object__find_program_by_name(obj, "action_handler"); + if (!prog) { + err_msg("BPF action program must have 'action_handler' function: %s\n", + program_path); + goto out_obj_err; + } + + err = timerlat_bpf_set_action(prog); + if (err) { + err_msg("Failed to register BPF action program: %s\n", program_path); + goto out_prog_err; + } + + return 0; + +out_prog_err: + prog = NULL; +out_obj_err: + bpf_object__close(obj); + obj = NULL; +out_err: + return 1; +} + #endif /* HAVE_BPF_SKEL */ diff --git a/tools/tracing/rtla/src/timerlat_bpf.h b/tools/tracing/rtla/src/timerlat_bpf.h index b5009092c7a3..169abeaf4363 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.h +++ b/tools/tracing/rtla/src/timerlat_bpf.h @@ -30,7 +30,7 @@ int timerlat_bpf_get_summary_value(enum summary_field key, long long *value_thread, long long *value_user, int cpus); - +int timerlat_load_bpf_action_program(const char *program_path); static inline int have_libbpf_support(void) { return 1; } #else static inline int timerlat_bpf_init(struct timerlat_params *params) @@ -58,6 +58,10 @@ static inline int timerlat_bpf_get_summary_value(enum summary_field key, { return -1; } +static inline int timerlat_load_bpf_action_program(const char *program_path) +{ + return -1; +} static inline int have_libbpf_support(void) { return 0; } #endif /* HAVE_BPF_SKEL */ #endif /* __bpf__ */ diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 2a5c543217ba..ec43e6fda743 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -746,6 +746,7 @@ static void timerlat_hist_usage(void) " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", " --on-threshold : define action to be executed at latency threshold, multiple are allowed", " --on-end : define action to be executed at measurement end, multiple are allowed", + " --bpf-action : load and execute BPF program when latency threshold is exceeded", NULL, }; @@ -825,6 +826,7 @@ static struct common_params {"deepest-idle-state", required_argument, 0, '\4'}, {"on-threshold", required_argument, 0, '\5'}, {"on-end", required_argument, 0, '\6'}, + {"bpf-action", required_argument, 0, '\7'}, {0, 0, 0, 0} }; @@ -1006,6 +1008,9 @@ static struct common_params if (retval) fatal("Invalid action %s", optarg); break; + case '\7': + params->bpf_action_program = optarg; + break; default: fatal("Invalid option"); } diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 9ed8b931552f..af20b3eee472 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -518,6 +518,7 @@ static void timerlat_top_usage(void) " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", " --on-threshold : define action to be executed at latency threshold, multiple are allowed", " --on-end: define action to be executed at measurement end, multiple are allowed", + " --bpf-action : load and execute BPF program when latency threshold is exceeded", NULL, }; @@ -589,6 +590,7 @@ static struct common_params {"deepest-idle-state", required_argument, 0, '8'}, {"on-threshold", required_argument, 0, '9'}, {"on-end", required_argument, 0, '\1'}, + {"bpf-action", required_argument, 0, '\2'}, {0, 0, 0, 0} }; @@ -756,6 +758,9 @@ static struct common_params if (retval) fatal("Invalid action %s", optarg); break; + case '\2': + params->bpf_action_program = optarg; + break; default: fatal("Invalid option"); } -- cgit v1.2.3 From 0304a3b7ec9a207637ab6f360a41af5fb25e1f44 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 26 Nov 2025 15:42:01 +0100 Subject: rtla/timerlat: Add example for BPF action program Add an example BPF action program that prints the measured latency to the tracefs buffer via bpf_printk(). A new Makefile target, "examples", is added to build the example. In addition, "sample/" subfolder is renamed to "example". If BPF skeleton support is unavailable or disabled, a warning will be displayed when building the BPF action program example. Link: https://lore.kernel.org/r/20251126144205.331954-4-tglozar@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/Makefile | 9 ++- tools/tracing/rtla/example/timerlat_bpf_action.c | 16 +++++ tools/tracing/rtla/example/timerlat_load.py | 78 ++++++++++++++++++++++++ tools/tracing/rtla/sample/timerlat_load.py | 78 ------------------------ 4 files changed, 102 insertions(+), 79 deletions(-) create mode 100644 tools/tracing/rtla/example/timerlat_bpf_action.c create mode 100644 tools/tracing/rtla/example/timerlat_load.py delete mode 100644 tools/tracing/rtla/sample/timerlat_load.py (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 746ccf2f5808..5f1529ce3693 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -73,9 +73,15 @@ src/timerlat.bpf.o: src/timerlat.bpf.c src/timerlat.skel.h: src/timerlat.bpf.o $(QUIET_GENSKEL)$(SYSTEM_BPFTOOL) gen skeleton $< > $@ + +example/timerlat_bpf_action.o: example/timerlat_bpf_action.c + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@ else src/timerlat.skel.h: $(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h + +example/timerlat_bpf_action.o: example/timerlat_bpf_action.c + $(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o" endif $(RTLA): $(RTLA_IN) @@ -96,7 +102,8 @@ clean: doc_clean fixdep-clean $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-* $(Q)rm -rf feature - $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h + $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o check: $(RTLA) RTLA=$(RTLA) prove -o -f tests/ +examples: example/timerlat_bpf_action.o .PHONY: FORCE clean check diff --git a/tools/tracing/rtla/example/timerlat_bpf_action.c b/tools/tracing/rtla/example/timerlat_bpf_action.c new file mode 100644 index 000000000000..ac1be049a848 --- /dev/null +++ b/tools/tracing/rtla/example/timerlat_bpf_action.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +char LICENSE[] SEC("license") = "GPL"; + +struct trace_event_raw_timerlat_sample { + unsigned long long timer_latency; +} __attribute__((preserve_access_index)); + +SEC("tp/timerlat_action") +int action_handler(struct trace_event_raw_timerlat_sample *tp_args) +{ + bpf_printk("Latency: %lld\n", tp_args->timer_latency); + return 0; +} diff --git a/tools/tracing/rtla/example/timerlat_load.py b/tools/tracing/rtla/example/timerlat_load.py new file mode 100644 index 000000000000..a819c3588073 --- /dev/null +++ b/tools/tracing/rtla/example/timerlat_load.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (C) 2024 Red Hat, Inc. Daniel Bristot de Oliveira +# +# This is a sample code about how to use timerlat's timer by any workload +# so rtla can measure and provide auto-analysis for the overall latency (IOW +# the response time) for a task. +# +# Before running it, you need to dispatch timerlat with -U option in a terminal. +# Then # run this script pinned to a CPU on another terminal. For example: +# +# timerlat_load.py 1 -p 95 +# +# The "Timerlat IRQ" is the IRQ latency, The thread latency is the latency +# for the python process to get the CPU. The Ret from user Timer Latency is +# the overall latency. In other words, it is the response time for that +# activation. +# +# This is just an example, the load is reading 20MB of data from /dev/full +# It is in python because it is easy to read :-) + +import argparse +import sys +import os + +parser = argparse.ArgumentParser(description='user-space timerlat thread in Python') +parser.add_argument("cpu", type=int, help='CPU to run timerlat thread') +parser.add_argument("-p", "--prio", type=int, help='FIFO priority') +args = parser.parse_args() + +try: + affinity_mask = {args.cpu} + os.sched_setaffinity(0, affinity_mask) +except Exception as e: + print(f"Error setting affinity: {e}") + sys.exit(1) + +if args.prio: + try: + param = os.sched_param(args.prio) + os.sched_setscheduler(0, os.SCHED_FIFO, param) + except Exception as e: + print(f"Error setting priority: {e}") + sys.exit(1) + +try: + timerlat_path = f"/sys/kernel/tracing/osnoise/per_cpu/cpu{args.cpu}/timerlat_fd" + timerlat_fd = open(timerlat_path, 'r') +except PermissionError: + print("Permission denied. Please check your access rights.") + sys.exit(1) +except OSError: + print("Error opening timerlat fd, did you run timerlat -U?") + sys.exit(1) + +try: + data_fd = open("/dev/full", 'r') +except Exception as e: + print(f"Error opening data fd: {e}") + sys.exit(1) + +while True: + try: + timerlat_fd.read(1) + data_fd.read(20 * 1024 * 1024) + except KeyboardInterrupt: + print("Leaving") + break + except IOError as e: + print(f"I/O error occurred: {e}") + break + except Exception as e: + print(f"Unexpected error: {e}") + break + +timerlat_fd.close() +data_fd.close() diff --git a/tools/tracing/rtla/sample/timerlat_load.py b/tools/tracing/rtla/sample/timerlat_load.py deleted file mode 100644 index a819c3588073..000000000000 --- a/tools/tracing/rtla/sample/timerlat_load.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) 2024 Red Hat, Inc. Daniel Bristot de Oliveira -# -# This is a sample code about how to use timerlat's timer by any workload -# so rtla can measure and provide auto-analysis for the overall latency (IOW -# the response time) for a task. -# -# Before running it, you need to dispatch timerlat with -U option in a terminal. -# Then # run this script pinned to a CPU on another terminal. For example: -# -# timerlat_load.py 1 -p 95 -# -# The "Timerlat IRQ" is the IRQ latency, The thread latency is the latency -# for the python process to get the CPU. The Ret from user Timer Latency is -# the overall latency. In other words, it is the response time for that -# activation. -# -# This is just an example, the load is reading 20MB of data from /dev/full -# It is in python because it is easy to read :-) - -import argparse -import sys -import os - -parser = argparse.ArgumentParser(description='user-space timerlat thread in Python') -parser.add_argument("cpu", type=int, help='CPU to run timerlat thread') -parser.add_argument("-p", "--prio", type=int, help='FIFO priority') -args = parser.parse_args() - -try: - affinity_mask = {args.cpu} - os.sched_setaffinity(0, affinity_mask) -except Exception as e: - print(f"Error setting affinity: {e}") - sys.exit(1) - -if args.prio: - try: - param = os.sched_param(args.prio) - os.sched_setscheduler(0, os.SCHED_FIFO, param) - except Exception as e: - print(f"Error setting priority: {e}") - sys.exit(1) - -try: - timerlat_path = f"/sys/kernel/tracing/osnoise/per_cpu/cpu{args.cpu}/timerlat_fd" - timerlat_fd = open(timerlat_path, 'r') -except PermissionError: - print("Permission denied. Please check your access rights.") - sys.exit(1) -except OSError: - print("Error opening timerlat fd, did you run timerlat -U?") - sys.exit(1) - -try: - data_fd = open("/dev/full", 'r') -except Exception as e: - print(f"Error opening data fd: {e}") - sys.exit(1) - -while True: - try: - timerlat_fd.read(1) - data_fd.read(20 * 1024 * 1024) - except KeyboardInterrupt: - print("Leaving") - break - except IOError as e: - print(f"I/O error occurred: {e}") - break - except Exception as e: - print(f"Unexpected error: {e}") - break - -timerlat_fd.close() -data_fd.close() -- cgit v1.2.3 From 5525aebd4e0c6f7d92ec1cb074218bbcf3d46f13 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 26 Nov 2025 15:42:02 +0100 Subject: rtla/tests: Test BPF action program Add a test that implements a BPF program writing to a test map, which is attached to RTLA via --bpf-action to be executed on theshold overflow. A combination of --on-threshold shell with bpftool (which is always present if BPF support is enabled) is used to check whether the BPF program has executed successfully. Suggested-by: Crystal Wood Link: https://lore.kernel.org/r/20251126144205.331954-5-tglozar@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/Makefile | 10 ++++++++-- tools/tracing/rtla/tests/bpf/bpf_action_map.c | 25 +++++++++++++++++++++++++ tools/tracing/rtla/tests/timerlat.t | 15 +++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 tools/tracing/rtla/tests/bpf/bpf_action_map.c (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 5f1529ce3693..aef814b639b7 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -76,12 +76,18 @@ src/timerlat.skel.h: src/timerlat.bpf.o example/timerlat_bpf_action.o: example/timerlat_bpf_action.c $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@ + +tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@ else src/timerlat.skel.h: $(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h example/timerlat_bpf_action.o: example/timerlat_bpf_action.c $(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o" + +tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c + $(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o" endif $(RTLA): $(RTLA_IN) @@ -103,7 +109,7 @@ clean: doc_clean fixdep-clean $(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-* $(Q)rm -rf feature $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o -check: $(RTLA) - RTLA=$(RTLA) prove -o -f tests/ +check: $(RTLA) tests/bpf/bpf_action_map.o + RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f tests/ examples: example/timerlat_bpf_action.o .PHONY: FORCE clean check diff --git a/tools/tracing/rtla/tests/bpf/bpf_action_map.c b/tools/tracing/rtla/tests/bpf/bpf_action_map.c new file mode 100644 index 000000000000..1686e0b858e6 --- /dev/null +++ b/tools/tracing/rtla/tests/bpf/bpf_action_map.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +char LICENSE[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, unsigned int); + __type(value, unsigned long long); +} rtla_test_map SEC(".maps"); + +struct trace_event_raw_timerlat_sample; + +SEC("tp/timerlat_action") +int action_handler(struct trace_event_raw_timerlat_sample *tp_args) +{ + unsigned int key = 0; + unsigned long long value = 42; + + bpf_map_update_elem(&rtla_test_map, &key, &value, BPF_ANY); + + return 0; +} diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t index bbaa1897d8a8..fd4935fd7b49 100644 --- a/tools/tracing/rtla/tests/timerlat.t +++ b/tools/tracing/rtla/tests/timerlat.t @@ -67,6 +67,21 @@ check "hist with trace output at end" \ "timerlat hist -d 1s --on-end trace" 0 "^ Saving trace to timerlat_trace.txt$" check "top with trace output at end" \ "timerlat top -d 1s --on-end trace" 0 "^ Saving trace to timerlat_trace.txt$" + +# BPF action program tests +if [ "$option" -eq 0 ] +then + # Test BPF action program properly in BPF mode + [ -z "$BPFTOOL" ] && BPFTOOL=bpftool + check "hist with BPF action program (BPF mode)" \ + "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \ + 2 '"value": 42' +else + # Test BPF action program failure in non-BPF mode + check "hist with BPF action program (non-BPF mode)" \ + "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o" \ + 1 "BPF actions are not supported in tracefs-only mode" +fi done test_end -- cgit v1.2.3 From fbb8ed6682f84e6e27c798a3117b0bcd4d0623c4 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 26 Nov 2025 15:42:03 +0100 Subject: rtla/tests: Run Test::Harness in verbose mode Add -v flag to prove command to also print the names of tests that succeeded, not only those that failed, to allow easier debugging of the test suite. Also, drop printing the option and value to stdout in check_with_osnoise_options, which was a debugging print that was accidentally left in the final commit, and which would be otherwise now visible in make check output, as stdout is no longer suppressed. Suggested-by: Crystal Wood Reviewed-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20251126144205.331954-6-tglozar@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/Makefile | 2 +- tools/tracing/rtla/tests/engine.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index aef814b639b7..2701256abaf3 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -110,6 +110,6 @@ clean: doc_clean fixdep-clean $(Q)rm -rf feature $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o check: $(RTLA) tests/bpf/bpf_action_map.o - RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f tests/ + RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f -v tests/ examples: example/timerlat_bpf_action.o .PHONY: FORCE clean check diff --git a/tools/tracing/rtla/tests/engine.sh b/tools/tracing/rtla/tests/engine.sh index c7de3d6ed6a8..ed261e07c6d9 100644 --- a/tools/tracing/rtla/tests/engine.sh +++ b/tools/tracing/rtla/tests/engine.sh @@ -105,7 +105,6 @@ check_with_osnoise_options() { [ "$1" == "" ] && continue option=$(echo $1 | cut -d '=' -f 1) value=$(echo $1 | cut -d '=' -f 2) - echo "option: $option, value: $value" echo "$value" > "/sys/kernel/tracing/osnoise/$option" || return 1 done fi -- cgit v1.2.3 From 850cd24cb6d648262b994b99e189409b21a2c09b Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:40 +0200 Subject: tools/rtla: Add common_parse_options() Each rtla tool duplicates parsing of many common options. This creates maintenance overhead and risks inconsistencies when updating these options. Add common_parse_options() to centralize parsing of options used across all tools. Common options to be migrated in future patches. Changes since v1: - restore opterr Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-1-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 35 ++++++++++++++++++++++++++++++++++ tools/tracing/rtla/src/common.h | 1 + tools/tracing/rtla/src/osnoise_hist.c | 3 +++ tools/tracing/rtla/src/osnoise_top.c | 3 +++ tools/tracing/rtla/src/timerlat_hist.c | 3 +++ tools/tracing/rtla/src/timerlat_top.c | 3 +++ 6 files changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 009a4bce9737..c01de7972bea 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "common.h" struct trace_instance *trace_inst; @@ -38,6 +39,40 @@ static void set_signals(struct common_params *params) } } +/* + * common_parse_options - parse common command line options + * + * @argc: argument count + * @argv: argument vector + * @common: common parameters structure + * + * Parse command line options that are common to all rtla tools. + * + * Returns: non zero if a common option was parsed, or 0 + * if the option should be handled by tool-specific parsing. + */ +int common_parse_options(int argc, char **argv, struct common_params *common) +{ + int saved_state = optind; + int c; + + static struct option long_options[] = { + {0, 0, 0, 0} + }; + + opterr = 0; + c = getopt_long(argc, argv, "", long_options, NULL); + opterr = 1; + + switch (c) { + default: + optind = saved_state; + return 0; + } + + return c; +} + /* * common_apply_config - apply common configs to the initialized tool */ diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index c48c9bfd20e3..ef17ea5be540 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -156,6 +156,7 @@ int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us); int osnoise_set_stop_total_us(struct osnoise_context *context, long long stop_total_us); +int common_parse_options(int argc, char **argv, struct common_params *common); int common_apply_config(struct osnoise_tool *tool, struct common_params *params); int top_main_loop(struct osnoise_tool *tool); int hist_main_loop(struct osnoise_tool *tool); diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 372128db9e4a..d5c78e07bf60 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -512,6 +512,9 @@ static struct common_params {0, 0, 0, 0} }; + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 1db1d946b600..2bb154da1139 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -363,6 +363,9 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) {0, 0, 0, 0} }; + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index ec43e6fda743..a17111b6aa6d 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -830,6 +830,9 @@ static struct common_params {0, 0, 0, 0} }; + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index af20b3eee472..b14a785361b1 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -594,6 +594,9 @@ static struct common_params {0, 0, 0, 0} }; + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); -- cgit v1.2.3 From 28dc445919bf4019ffdaf65d94bf26ace25d4a5e Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:41 +0200 Subject: tools/rtla: Consolidate -c/--cpus option parsing Each rtla tool duplicates parsing of -c/--cpus. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-2-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 8 +++++++- tools/tracing/rtla/src/osnoise_hist.c | 9 +-------- tools/tracing/rtla/src/osnoise_top.c | 9 +-------- tools/tracing/rtla/src/timerlat_hist.c | 9 +-------- tools/tracing/rtla/src/timerlat_top.c | 9 +-------- 5 files changed, 11 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index c01de7972bea..1b9e0108b0ba 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -57,14 +57,20 @@ int common_parse_options(int argc, char **argv, struct common_params *common) int c; static struct option long_options[] = { + {"cpus", required_argument, 0, 'c'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "", long_options, NULL); + c = getopt_long(argc, argv, "c:", long_options, NULL); opterr = 1; switch (c) { + case 'c': + if (parse_cpu_set(optarg, &common->monitored_cpus)) + fatal("Invalid -c cpu list"); + common->cpus = optarg; + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index d5c78e07bf60..443cb7f0e3a2 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -485,7 +485,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, @@ -515,7 +514,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -541,12 +540,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; case 'C': params->common.cgroup = 1; params->common.cgroup_name = parse_optional_arg(argc, argv); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 2bb154da1139..8ba0757225b3 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, @@ -366,7 +365,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -385,12 +384,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (!trace_output) trace_output = "osnoise_trace.txt"; - break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index a17111b6aa6d..3571aa8233c1 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -791,7 +791,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, {"bucket-size", required_argument, 0, 'b'}, {"debug", no_argument, 0, 'D'}, @@ -833,7 +832,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -855,12 +854,6 @@ static struct common_params if (!trace_output) trace_output = "timerlat_trace.txt"; - break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index b14a785361b1..b0a048cded29 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -561,7 +561,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, @@ -597,7 +596,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -634,12 +633,6 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; case 'C': params->common.cgroup = 1; params->common.cgroup_name = optarg; -- cgit v1.2.3 From edb23c8372222395fd4e4297240cbe2191425dbf Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:42 +0200 Subject: tools/rtla: Consolidate -C/--cgroup option parsing Each rtla tool duplicates parsing of -C/--cgroup. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-3-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 7 ++++++- tools/tracing/rtla/src/osnoise_hist.c | 7 +------ tools/tracing/rtla/src/osnoise_top.c | 7 +------ tools/tracing/rtla/src/timerlat_hist.c | 7 +------ tools/tracing/rtla/src/timerlat_top.c | 7 +------ 5 files changed, 10 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 1b9e0108b0ba..3400836f66ef 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -58,11 +58,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common) static struct option long_options[] = { {"cpus", required_argument, 0, 'c'}, + {"cgroup", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:", long_options, NULL); + c = getopt_long(argc, argv, "c:C::", long_options, NULL); opterr = 1; switch (c) { @@ -71,6 +72,10 @@ int common_parse_options(int argc, char **argv, struct common_params *common) fatal("Invalid -c cpu list"); common->cpus = optarg; break; + case 'C': + common->cgroup = 1; + common->cgroup_name = parse_optional_arg(argc, argv); + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 443cb7f0e3a2..bcd4b4c96354 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -485,7 +485,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"house-keeping", required_argument, 0, 'H'}, @@ -514,7 +513,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -540,10 +539,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); - break; case 'D': config_debug = 1; break; diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 8ba0757225b3..2799dd75a4e2 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, @@ -365,7 +364,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -384,10 +383,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (!trace_output) trace_output = "osnoise_trace.txt"; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); break; case 'D': config_debug = 1; diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 3571aa8233c1..64c1dbb1fccc 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -791,7 +791,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cgroup", optional_argument, 0, 'C'}, {"bucket-size", required_argument, 0, 'b'}, {"debug", no_argument, 0, 'D'}, {"entries", required_argument, 0, 'E'}, @@ -832,7 +831,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -854,10 +853,6 @@ static struct common_params if (!trace_output) trace_output = "timerlat_trace.txt"; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); break; case 'b': params->common.hist.bucket_size = get_llong_from_str(optarg); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index b0a048cded29..bdc8bf265836 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -561,7 +561,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, @@ -596,7 +595,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -633,10 +632,6 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = optarg; - break; case 'D': config_debug = 1; break; -- cgit v1.2.3 From fd788c49a90328f5b2edaa87aa5af18648ade718 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:43 +0200 Subject: tools/rtla: Consolidate -D/--debug option parsing Each rtla tool duplicates parsing of -D/--debug. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-4-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 6 +++++- tools/tracing/rtla/src/osnoise_hist.c | 6 +----- tools/tracing/rtla/src/osnoise_top.c | 6 +----- tools/tracing/rtla/src/timerlat_hist.c | 6 +----- tools/tracing/rtla/src/timerlat_top.c | 6 +----- 5 files changed, 9 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 3400836f66ef..f71cf7c7f4e3 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -59,11 +59,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common) static struct option long_options[] = { {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, + {"debug", no_argument, 0, 'D'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:C::", long_options, NULL); + c = getopt_long(argc, argv, "c:C::D", long_options, NULL); opterr = 1; switch (c) { @@ -76,6 +77,9 @@ int common_parse_options(int argc, char **argv, struct common_params *common) common->cgroup = 1; common->cgroup_name = parse_optional_arg(argc, argv); break; + case 'D': + config_debug = 1; + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index bcd4b4c96354..c9422b596622 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -485,7 +485,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, @@ -513,7 +512,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:d:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -539,9 +538,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'D': - config_debug = 1; - break; case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 2799dd75a4e2..8d49042d10f0 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, {"house-keeping", required_argument, 0, 'H'}, @@ -364,7 +363,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:d:e:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -383,9 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (!trace_output) trace_output = "osnoise_trace.txt"; - break; - case 'D': - config_debug = 1; break; case 'd': params->common.duration = parse_seconds_duration(optarg); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 64c1dbb1fccc..c08f628047c1 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -792,7 +792,6 @@ static struct common_params static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, - {"debug", no_argument, 0, 'D'}, {"entries", required_argument, 0, 'E'}, {"duration", required_argument, 0, 'd'}, {"house-keeping", required_argument, 0, 'H'}, @@ -831,7 +830,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:d:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -860,9 +859,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'D': - config_debug = 1; - break; case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index bdc8bf265836..7c0a3f582273 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -561,7 +561,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, {"help", no_argument, 0, 'h'}, @@ -595,7 +594,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:d:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -632,9 +631,6 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'D': - config_debug = 1; - break; case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) -- cgit v1.2.3 From 76975581fb0eba03820fe312094981c995c225f9 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:44 +0200 Subject: tools/rtla: Consolidate -d/--duration option parsing Each rtla tool duplicates parsing of -d/--duration. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-5-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 8 +++++++- tools/tracing/rtla/src/osnoise_hist.c | 8 +------- tools/tracing/rtla/src/osnoise_top.c | 8 +------- tools/tracing/rtla/src/timerlat_hist.c | 8 +------- tools/tracing/rtla/src/timerlat_top.c | 8 +------- 5 files changed, 11 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index f71cf7c7f4e3..0776f1568d23 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -60,11 +60,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common) {"cpus", required_argument, 0, 'c'}, {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, + {"duration", required_argument, 0, 'd'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:C::D", long_options, NULL); + c = getopt_long(argc, argv, "c:C::Dd:", long_options, NULL); opterr = 1; switch (c) { @@ -80,6 +81,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common) case 'D': config_debug = 1; break; + case 'd': + common->duration = parse_seconds_duration(optarg); + if (!common->duration) + fatal("Invalid -d duration"); + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index c9422b596622..f34c88fd57e2 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -485,7 +485,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"duration", required_argument, 0, 'd'}, {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, @@ -512,7 +511,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:d:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -538,11 +537,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -D duration"); - break; case 'e': tevent = trace_event_alloc(optarg); if (!tevent) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 8d49042d10f0..695c6ecf0098 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, @@ -363,7 +362,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:d:e:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:e:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -383,11 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) trace_output = "osnoise_trace.txt"; break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -d duration"); - break; case 'e': tevent = trace_event_alloc(optarg); if (!tevent) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index c08f628047c1..d625dbe44676 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -793,7 +793,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"duration", required_argument, 0, 'd'}, {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"irq", required_argument, 0, 'i'}, @@ -830,7 +829,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:d:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -859,11 +858,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -D duration"); - break; case 'e': tevent = trace_event_alloc(optarg); if (!tevent) diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 7c0a3f582273..95e949f49cbd 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -561,7 +561,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, {"help", no_argument, 0, 'h'}, {"house-keeping", required_argument, 0, 'H'}, @@ -594,7 +593,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:d:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -631,11 +630,6 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -d duration"); - break; case 'e': tevent = trace_event_alloc(optarg); if (!tevent) -- cgit v1.2.3 From c93c25fca5ab3c27b42f1f941871209573c0b41b Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:45 +0200 Subject: tools/rtla: Consolidate -e/--event option parsing Each rtla tool duplicates parsing of -e/--event. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-6-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 13 ++++++++++++- tools/tracing/rtla/src/osnoise_hist.c | 14 +------------- tools/tracing/rtla/src/osnoise_top.c | 14 +------------- tools/tracing/rtla/src/timerlat_hist.c | 14 +------------- tools/tracing/rtla/src/timerlat_top.c | 13 +------------ 5 files changed, 16 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 0776f1568d23..fbd38d80f1ac 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -53,6 +53,7 @@ static void set_signals(struct common_params *params) */ int common_parse_options(int argc, char **argv, struct common_params *common) { + struct trace_events *tevent; int saved_state = optind; int c; @@ -61,11 +62,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common) {"cgroup", optional_argument, 0, 'C'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, + {"event", required_argument, 0, 'e'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:C::Dd:", long_options, NULL); + c = getopt_long(argc, argv, "c:C::Dd:e:", long_options, NULL); opterr = 1; switch (c) { @@ -86,6 +88,15 @@ int common_parse_options(int argc, char **argv, struct common_params *common) if (!common->duration) fatal("Invalid -d duration"); break; + case 'e': + tevent = trace_event_alloc(optarg); + if (!tevent) + fatal("Error alloc trace event"); + + if (common->events) + tevent->next = common->events; + common->events = tevent; + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index f34c88fd57e2..8b3eab6092bb 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -463,7 +463,6 @@ static struct common_params *osnoise_hist_parse_args(int argc, char *argv[]) { struct osnoise_params *params; - struct trace_events *tevent; int retval; int c; char *trace_output = NULL; @@ -493,7 +492,6 @@ static struct common_params {"stop", required_argument, 0, 's'}, {"stop-total", required_argument, 0, 'S'}, {"trace", optional_argument, 0, 't'}, - {"event", required_argument, 0, 'e'}, {"threshold", required_argument, 0, 'T'}, {"no-header", no_argument, 0, '0'}, {"no-summary", no_argument, 0, '1'}, @@ -511,7 +509,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -537,16 +535,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - - params->common.events = tevent; - break; case 'E': params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 695c6ecf0098..47aac00e2848 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -315,7 +315,6 @@ static void osnoise_top_usage(struct osnoise_params *params) struct common_params *osnoise_top_parse_args(int argc, char **argv) { struct osnoise_params *params; - struct trace_events *tevent; int retval; int c; char *trace_output = NULL; @@ -339,7 +338,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"event", required_argument, 0, 'e'}, {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, @@ -362,7 +360,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:e:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -381,16 +379,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (!trace_output) trace_output = "osnoise_trace.txt"; - break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - params->common.events = tevent; - break; case 'h': case '?': diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index d625dbe44676..32424e2bd34a 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -761,7 +761,6 @@ static struct common_params *timerlat_hist_parse_args(int argc, char *argv[]) { struct timerlat_params *params; - struct trace_events *tevent; int auto_thresh; int retval; int c; @@ -805,7 +804,6 @@ static struct common_params {"user-threads", no_argument, 0, 'u'}, {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, - {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, {"no-thread", no_argument, 0, '1'}, {"no-header", no_argument, 0, '2'}, @@ -829,7 +827,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -858,16 +856,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - - params->common.events = tevent; - break; case 'E': params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 95e949f49cbd..928d887e0c2e 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -533,7 +533,6 @@ static struct common_params *timerlat_top_parse_args(int argc, char **argv) { struct timerlat_params *params; - struct trace_events *tevent; long long auto_thresh; int retval; int c; @@ -561,7 +560,6 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"event", required_argument, 0, 'e'}, {"help", no_argument, 0, 'h'}, {"house-keeping", required_argument, 0, 'H'}, {"irq", required_argument, 0, 'i'}, @@ -593,7 +591,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -630,15 +628,6 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - params->common.events = tevent; - break; case 'h': case '?': timerlat_top_usage(); -- cgit v1.2.3 From 5cc90b14ee54591b890ad026ad5e01b2960c3a31 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:46 +0200 Subject: tools/rtla: Consolidate -P/--priority option parsing Each rtla tool duplicates parsing of -P/--priority. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-7-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 8 +++++++- tools/tracing/rtla/src/osnoise_hist.c | 9 +-------- tools/tracing/rtla/src/osnoise_top.c | 9 +-------- tools/tracing/rtla/src/timerlat_hist.c | 9 +-------- tools/tracing/rtla/src/timerlat_top.c | 9 +-------- 5 files changed, 11 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index fbd38d80f1ac..90f1bbb7e189 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -63,11 +63,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common) {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, + {"priority", required_argument, 0, 'P'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:C::Dd:e:", long_options, NULL); + c = getopt_long(argc, argv, "c:C::Dd:e:P:", long_options, NULL); opterr = 1; switch (c) { @@ -97,6 +98,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common) tevent->next = common->events; common->events = tevent; break; + case 'P': + if (parse_prio(optarg, &common->sched_param) == -1) + fatal("Invalid -P priority"); + common->set_sched = 1; + break; default: optind = saved_state; return 0; diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 8b3eab6092bb..6e66726766a1 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -487,7 +487,6 @@ static struct common_params {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"runtime", required_argument, 0, 'r'}, {"stop", required_argument, 0, 's'}, {"stop-total", required_argument, 0, 'S'}, @@ -509,7 +508,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:E:hH:p:P:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:E:hH:p:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -556,12 +555,6 @@ static struct common_params if (params->period > 10000000) fatal("Period longer than 10 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'r': params->runtime = get_llong_from_str(optarg); if (params->runtime < 100) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 47aac00e2848..7ac992ec7439 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -341,7 +341,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"quiet", no_argument, 0, 'q'}, {"runtime", required_argument, 0, 'r'}, {"stop", required_argument, 0, 's'}, @@ -360,7 +359,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:hH:p:P:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:hH:p:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -395,12 +394,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (params->period > 10000000) fatal("Period longer than 10 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'q': params->common.quiet = 1; break; diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 32424e2bd34a..99b416ccfc5b 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -797,7 +797,6 @@ static struct common_params {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"stack", required_argument, 0, 's'}, {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, @@ -827,7 +826,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:E:hH:i:knp:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -886,12 +885,6 @@ static struct common_params if (params->timerlat_period_us > 1000000) fatal("Period longer than 1 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 's': params->print_stack = get_llong_from_str(optarg); break; diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 928d887e0c2e..027aad1b639f 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -565,7 +565,6 @@ static struct common_params {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"quiet", no_argument, 0, 'q'}, {"stack", required_argument, 0, 's'}, {"thread", required_argument, 0, 'T'}, @@ -591,7 +590,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:hH:i:knp:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -652,12 +651,6 @@ static struct common_params if (params->timerlat_period_us > 1000000) fatal("Period longer than 1 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'q': params->common.quiet = 1; break; -- cgit v1.2.3 From 0576be469ef18a9f3460f6f207183033ae8b90c5 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 9 Dec 2025 12:00:47 +0200 Subject: tools/rtla: Consolidate -H/--house-keeping option parsing Each rtla tool duplicates parsing of -H/--house-keeping. Migrate the option parsing from individual tools to the common_parse_options(). Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251209100047.2692515-8-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 8 +++++++- tools/tracing/rtla/src/osnoise_hist.c | 9 +-------- tools/tracing/rtla/src/osnoise_top.c | 9 +-------- tools/tracing/rtla/src/timerlat_hist.c | 9 +-------- tools/tracing/rtla/src/timerlat_top.c | 9 +-------- 5 files changed, 11 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 90f1bbb7e189..6f64c1fc1b62 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -63,12 +63,13 @@ int common_parse_options(int argc, char **argv, struct common_params *common) {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, {"event", required_argument, 0, 'e'}, + {"house-keeping", required_argument, 0, 'H'}, {"priority", required_argument, 0, 'P'}, {0, 0, 0, 0} }; opterr = 0; - c = getopt_long(argc, argv, "c:C::Dd:e:P:", long_options, NULL); + c = getopt_long(argc, argv, "c:C::Dd:e:H:P:", long_options, NULL); opterr = 1; switch (c) { @@ -98,6 +99,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common) tevent->next = common->events; common->events = tevent; break; + case 'H': + common->hk_cpus = 1; + if (parse_cpu_set(optarg, &common->hk_cpu_set)) + fatal("Error parsing house keeping CPUs"); + break; case 'P': if (parse_prio(optarg, &common->sched_param) == -1) fatal("Invalid -P priority"); diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 6e66726766a1..705c73d55102 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -484,7 +484,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, {"runtime", required_argument, 0, 'r'}, @@ -508,7 +507,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:E:hH:p:r:s:S:t::T:01234:5:6:7:", + c = getopt_long(argc, argv, "a:b:E:hp:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -544,12 +543,6 @@ static struct common_params case '?': osnoise_hist_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 7ac992ec7439..d54d47947fb4 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -338,7 +338,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, {"quiet", no_argument, 0, 'q'}, @@ -359,7 +358,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:hH:p:qr:s:S:t::T:0:1:2:3:", + c = getopt_long(argc, argv, "a:hp:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -383,12 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) case '?': osnoise_top_usage(params); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 99b416ccfc5b..4e8c38a61197 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -792,7 +792,6 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, @@ -826,7 +825,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:b:E:hH:i:knp:s:t::T:uU0123456:7:8:9\1\2:\3:", + c = getopt_long(argc, argv, "a:b:E:hi:knp:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -865,12 +864,6 @@ static struct common_params case '?': timerlat_hist_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'i': params->common.stop_us = get_llong_from_str(optarg); break; diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 027aad1b639f..f5a809344913 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -561,7 +561,6 @@ static struct common_params static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, {"help", no_argument, 0, 'h'}, - {"house-keeping", required_argument, 0, 'H'}, {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, {"period", required_argument, 0, 'p'}, @@ -590,7 +589,7 @@ static struct common_params if (common_parse_options(argc, argv, ¶ms->common)) continue; - c = getopt_long(argc, argv, "a:hH:i:knp:qs:t::T:uU0:1:2:345:6:7:", + c = getopt_long(argc, argv, "a:hi:knp:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -631,12 +630,6 @@ static struct common_params case '?': timerlat_top_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'i': params->common.stop_us = get_llong_from_str(optarg); break; -- cgit v1.2.3 From 2a3a25336b1ba632a0a98249a7d4bbee454065aa Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Wed, 24 Dec 2025 14:50:56 +0200 Subject: tools/rtla: Deduplicate cgroup path opening code Both set_pid_cgroup() and set_comm_cgroup() functions contain identical code for opening the cgroup.procs file. Extract this common code into a new helper function open_cgroup_procs() to reduce code duplication and improve maintainability. Signed-off-by: Costa Shulyupin Link: https://lore.kernel.org/r/20251224125058.1771519-1-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/utils.c | 65 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 9cf5a0098e9a..0b84e02b13df 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -784,27 +784,27 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) } /* - * set_comm_cgroup - Set cgroup to pid_t pid + * open_cgroup_procs - Open the cgroup.procs file for the given cgroup * - * If cgroup argument is not NULL, the threads will move to the given cgroup. - * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used. + * If cgroup argument is not NULL, the cgroup.procs file for that cgroup + * will be opened. Otherwise, the cgroup of the calling, i.e., rtla, thread + * will be used. * * Supports cgroup v2. * - * Returns 1 on success, 0 otherwise. + * Returns the file descriptor on success, -1 otherwise. */ -int set_pid_cgroup(pid_t pid, const char *cgroup) +static int open_cgroup_procs(const char *cgroup) { char cgroup_path[MAX_PATH - strlen("/cgroup.procs")]; char cgroup_procs[MAX_PATH]; - char pid_str[24]; int retval; int cg_fd; retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path)); if (!retval) { err_msg("Did not find cgroupv2 mount point\n"); - return 0; + return -1; } if (!cgroup) { @@ -812,7 +812,7 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) sizeof(cgroup_path) - strlen(cgroup_path)); if (!retval) { err_msg("Did not find self cgroup\n"); - return 0; + return -1; } } else { snprintf(&cgroup_path[strlen(cgroup_path)], @@ -824,6 +824,29 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) debug_msg("Using cgroup path at: %s\n", cgroup_procs); cg_fd = open(cgroup_procs, O_RDWR); + if (cg_fd < 0) + return -1; + + return cg_fd; +} + +/* + * set_pid_cgroup - Set cgroup to pid_t pid + * + * If cgroup argument is not NULL, the threads will move to the given cgroup. + * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used. + * + * Supports cgroup v2. + * + * Returns 1 on success, 0 otherwise. + */ +int set_pid_cgroup(pid_t pid, const char *cgroup) +{ + char pid_str[24]; + int retval; + int cg_fd; + + cg_fd = open_cgroup_procs(cgroup); if (cg_fd < 0) return 0; @@ -853,8 +876,6 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) */ int set_comm_cgroup(const char *comm_prefix, const char *cgroup) { - char cgroup_path[MAX_PATH - strlen("/cgroup.procs")]; - char cgroup_procs[MAX_PATH]; struct dirent *proc_entry; DIR *procfs; int retval; @@ -866,29 +887,7 @@ int set_comm_cgroup(const char *comm_prefix, const char *cgroup) return 0; } - retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path)); - if (!retval) { - err_msg("Did not find cgroupv2 mount point\n"); - return 0; - } - - if (!cgroup) { - retval = get_self_cgroup(&cgroup_path[strlen(cgroup_path)], - sizeof(cgroup_path) - strlen(cgroup_path)); - if (!retval) { - err_msg("Did not find self cgroup\n"); - return 0; - } - } else { - snprintf(&cgroup_path[strlen(cgroup_path)], - sizeof(cgroup_path) - strlen(cgroup_path), "%s/", cgroup); - } - - snprintf(cgroup_procs, MAX_PATH, "%s/cgroup.procs", cgroup_path); - - debug_msg("Using cgroup path at: %s\n", cgroup_procs); - - cg_fd = open(cgroup_procs, O_RDWR); + cg_fd = open_cgroup_procs(cgroup); if (cg_fd < 0) return 0; -- cgit v1.2.3 From 648634d17c813b35da775982662e56ea8ce750de Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:39 -0300 Subject: rtla: Introduce for_each_action() helper The for loop to iterate over the list of actions is used in more than one place. To avoid code duplication and improve readability, introduce a for_each_action() helper macro. Replace the open-coded for loops with the new helper. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-4-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/actions.c | 6 ++++-- tools/tracing/rtla/src/actions.h | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index 8945aee58d51..31bc98db9228 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -32,7 +32,9 @@ void actions_destroy(struct actions *self) { /* Free any action-specific data */ - for (struct action *action = self->list; action < self->list + self->len; action++) { + struct action *action; + + for_each_action(self, action) { if (action->type == ACTION_SHELL) free(action->command); if (action->type == ACTION_TRACE_OUTPUT) @@ -223,7 +225,7 @@ actions_perform(struct actions *self) int pid, retval; const struct action *action; - for (action = self->list; action < self->list + self->len; action++) { + for_each_action(self, action) { switch (action->type) { case ACTION_TRACE_OUTPUT: retval = save_trace_to_file(self->trace_output_inst, action->trace_output); diff --git a/tools/tracing/rtla/src/actions.h b/tools/tracing/rtla/src/actions.h index a4f9b570775b..fb77069c972b 100644 --- a/tools/tracing/rtla/src/actions.h +++ b/tools/tracing/rtla/src/actions.h @@ -42,6 +42,11 @@ struct actions { struct tracefs_instance *trace_output_inst; }; +#define for_each_action(actions, action) \ + for ((action) = (actions)->list; \ + (action) < (actions)->list + (actions)->len; \ + (action)++) + void actions_init(struct actions *self); void actions_destroy(struct actions *self); int actions_add_trace_output(struct actions *self, const char *trace_output); -- cgit v1.2.3 From 7e9dfccf8f11c26208211457c4597a466135b56a Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:40 -0300 Subject: rtla: Replace atoi() with a robust strtoi() The atoi() function does not perform error checking, which can lead to undefined behavior when parsing invalid or out-of-range strings. This can cause issues when parsing user-provided numerical inputs, such as signal numbers, PIDs, or CPU lists. To address this, introduce a new strtoi() helper function that safely converts a string to an integer. This function validates the input and checks for overflows, returning a negative value on failure. Replace all calls to atoi() with the new strtoi() function and add proper error handling to make the parsing more robust and prevent potential issues. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-5-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/actions.c | 7 ++++--- tools/tracing/rtla/src/utils.c | 40 +++++++++++++++++++++++++++++++++++----- tools/tracing/rtla/src/utils.h | 2 ++ 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index 31bc98db9228..ace9965ebd55 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -181,12 +181,13 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn) /* Takes two arguments, num (signal) and pid */ while (token != NULL) { if (strlen(token) > 4 && strncmp(token, "num=", 4) == 0) { - signal = atoi(token + 4); + if (strtoi(token + 4, &signal)) + return -1; } else if (strlen(token) > 4 && strncmp(token, "pid=", 4) == 0) { if (strncmp(token + 4, "parent", 7) == 0) pid = -1; - else - pid = atoi(token + 4); + else if (strtoi(token + 4, &pid)) + return -1; } else { /* Invalid argument */ return -1; diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 0b84e02b13df..748b86e6c2cc 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "utils.h" @@ -127,16 +128,18 @@ int parse_cpu_set(char *cpu_list, cpu_set_t *set) nr_cpus = sysconf(_SC_NPROCESSORS_CONF); for (p = cpu_list; *p; ) { - cpu = atoi(p); - if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus) + if (strtoi(p, &cpu)) + goto err; + if (cpu < 0 || cpu >= nr_cpus) goto err; while (isdigit(*p)) p++; if (*p == '-') { p++; - end_cpu = atoi(p); - if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus) + if (strtoi(p, &end_cpu)) + goto err; + if (end_cpu < cpu || end_cpu >= nr_cpus) goto err; while (isdigit(*p)) p++; @@ -337,6 +340,7 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr) struct dirent *proc_entry; DIR *procfs; int retval; + int pid; if (strlen(comm_prefix) >= MAX_PATH) { err_msg("Command prefix is too long: %d < strlen(%s)\n", @@ -356,8 +360,12 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr) if (!retval) continue; + if (strtoi(proc_entry->d_name, &pid)) { + err_msg("'%s' is not a valid pid", proc_entry->d_name); + goto out_err; + } /* procfs_is_workload_pid confirmed it is a pid */ - retval = __set_sched_attr(atoi(proc_entry->d_name), attr); + retval = __set_sched_attr(pid, attr); if (retval) { err_msg("Error setting sched attributes for pid:%s\n", proc_entry->d_name); goto out_err; @@ -999,3 +1007,25 @@ char *parse_optional_arg(int argc, char **argv) return NULL; } } + +/* + * strtoi - convert string to integer with error checking + * + * Returns 0 on success, -1 if conversion fails or result is out of int range. + */ +int strtoi(const char *s, int *res) +{ + char *end_ptr; + long lres; + + if (!*s) + return -1; + + errno = 0; + lres = strtol(s, &end_ptr, 0); + if (errno || *end_ptr || lres > INT_MAX || lres < INT_MIN) + return -1; + + *res = (int) lres; + return 0; +} diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h index ed7618842e82..974f7e0188c0 100644 --- a/tools/tracing/rtla/src/utils.h +++ b/tools/tracing/rtla/src/utils.h @@ -3,6 +3,7 @@ #include #include #include +#include /* * '18446744073709551615\0' @@ -81,6 +82,7 @@ static inline int set_deepest_cpu_idle_state(unsigned int cpu, unsigned int stat static inline int have_libcpupower_support(void) { return 0; } #endif /* HAVE_LIBCPUPOWER_SUPPORT */ int auto_house_keeping(cpu_set_t *monitored_cpus); +__attribute__((__warn_unused_result__)) int strtoi(const char *s, int *res); #define ns_to_usf(x) (((double)x/1000)) #define ns_to_per(total, part) ((part * 100) / (double)total) -- cgit v1.2.3 From 9bf942f3c370c9b3af639df04cb5f34daf512dab Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:44 -0300 Subject: rtla: Use standard exit codes for result enum The result enum defines custom values for PASSED, ERROR, and FAILED. These values correspond to standard exit codes EXIT_SUCCESS and EXIT_FAILURE. Update the enum to use the standard macros EXIT_SUCCESS and EXIT_FAILURE to improve readability and adherence to standard C practices. The FAILED value is implicitly assigned EXIT_FAILURE + 1, so there is no need to assign an explicit value. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-9-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/utils.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h index 974f7e0188c0..f7c2a52a0ab5 100644 --- a/tools/tracing/rtla/src/utils.h +++ b/tools/tracing/rtla/src/utils.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * '18446744073709551615\0' @@ -88,7 +89,7 @@ __attribute__((__warn_unused_result__)) int strtoi(const char *s, int *res); #define ns_to_per(total, part) ((part * 100) / (double)total) enum result { - PASSED = 0, /* same as EXIT_SUCCESS */ - ERROR = 1, /* same as EXIT_FAILURE, an error in arguments */ - FAILED = 2, /* test hit the stop tracing condition */ + PASSED = EXIT_SUCCESS, + ERROR = EXIT_FAILURE, + FAILED, /* test hit the stop tracing condition */ }; -- cgit v1.2.3 From d849f3af1cc7a53e3b150a9bbade8f9629445b36 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:45 -0300 Subject: rtla: Remove redundant memset after calloc The actions struct is allocated using calloc, which already returns zeroed memory. The subsequent memset call to zero the 'present' member is therefore redundant. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-10-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/actions.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index ace9965ebd55..d9c1db5d97d4 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -19,8 +19,6 @@ actions_init(struct actions *self) self->len = 0; self->continue_flag = false; - memset(&self->present, 0, sizeof(self->present)); - /* This has to be set by the user */ self->trace_output_inst = NULL; } -- cgit v1.2.3 From f3cc3e4b5116929ebff27c3b0a565b34ae4969b3 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:47 -0300 Subject: rtla: Remove unused headers Remove unused includes for and to clean up the code and reduce unnecessary dependencies. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-12-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/osnoise_hist.c | 1 - tools/tracing/rtla/src/timerlat.c | 1 - tools/tracing/rtla/src/timerlat_top.c | 1 - tools/tracing/rtla/src/trace.c | 1 - 4 files changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 705c73d55102..9d70ea34807f 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index 8f6cf55f4a94..8f8811f7a13b 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index f5a809344913..284b74773c2b 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index 69cbc48d53d3..b8be3e28680e 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -2,7 +2,6 @@ #define _GNU_SOURCE #include #include -#include #include #include #include -- cgit v1.2.3 From a0890f9dbd24b302d327fe7dad9b9c5be0e278aa Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:48 -0300 Subject: rtla: Fix NULL pointer dereference in actions_parse The actions_parse() function uses strtok() to tokenize the trigger string, but does not check if the returned token is NULL before passing it to strcmp(). If the trigger parameter is an empty string or contains only delimiter characters, strtok() returns NULL, causing strcmp() to dereference a NULL pointer and crash the program. This issue can be triggered by malformed user input or edge cases in trigger string parsing. Add a NULL check immediately after the strtok() call to validate that a token was successfully extracted before using it. If no token is found, the function now returns -1 to indicate a parsing error. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-13-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/actions.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index d9c1db5d97d4..a42615011962 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -141,6 +141,8 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn) strcpy(trigger_c, trigger); token = strtok(trigger_c, ","); + if (!token) + return -1; if (strcmp(token, "trace") == 0) type = ACTION_TRACE_OUTPUT; -- cgit v1.2.3 From 02689ae385c5e84874620947ac010cf7b4950375 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:50 -0300 Subject: rtla: Add generated output files to gitignore The rtla tool generates various output files during testing and execution, including custom trace outputs and histogram data. These files are artifacts of running the tool with different options and should not be tracked in version control. Add gitignore entries for custom_filename.txt, osnoise_irq_noise_hist.txt, osnoise_trace.txt, and timerlat_trace.txt to prevent accidentally committing these generated files. This aligns with the existing pattern of ignoring build artifacts and generated headers like *.skel.h. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-15-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore index 1a394ad26cc1..4d39d64ac08c 100644 --- a/tools/tracing/rtla/.gitignore +++ b/tools/tracing/rtla/.gitignore @@ -5,3 +5,7 @@ fixdep feature FEATURE-DUMP *.skel.h +custom_filename.txt +osnoise_irq_noise_hist.txt +osnoise_trace.txt +timerlat_trace.txt -- cgit v1.2.3 From af2962d68b970b15d8910be2b0386b4f147ed78b Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:51 -0300 Subject: rtla: Make stop_tracing variable volatile The stop_tracing global variable is accessed from both the signal handler context and the main program flow without synchronization. This creates a potential race condition where compiler optimizations could cache the variable value in registers, preventing the signal handler's updates from being visible to other parts of the program. Add the volatile qualifier to stop_tracing in both common.c and common.h to ensure all accesses to this variable bypass compiler optimizations and read directly from memory. This guarantees that when the signal handler sets stop_tracing, the change is immediately visible to the main program loop, preventing potential hangs or delayed shutdown when termination signals are received. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-16-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/common.c | 2 +- tools/tracing/rtla/src/common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 6f64c1fc1b62..ceff76a62a30 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -10,7 +10,7 @@ #include "common.h" struct trace_instance *trace_inst; -int stop_tracing; +volatile int stop_tracing; static void stop_trace(int sig) { diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index ef17ea5be540..7602c5593ef5 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -54,7 +54,7 @@ struct osnoise_context { }; extern struct trace_instance *trace_inst; -extern int stop_tracing; +extern volatile int stop_tracing; struct hist_params { char no_irq; -- cgit v1.2.3 From 33e3c807ab22bd4002640c8fe47fa30fd4f44ca0 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:52 -0300 Subject: rtla: Ensure null termination after read operations in utils.c Add explicit null termination and buffer initialization for read() operations in procfs_is_workload_pid() and get_self_cgroup() functions. The read() system call does not null-terminate the data it reads, and when the buffer is filled to capacity, subsequent string operations will read past the buffer boundary searching for a null terminator. In procfs_is_workload_pid(), explicitly set buffer[MAX_PATH-1] to '\0' to ensure the buffer is always null-terminated before passing it to strncmp(). In get_self_cgroup(), use memset() to zero the path buffer before reading, which ensures null termination when retval is less than MAX_PATH. Additionally, set path[MAX_PATH-1] to '\0' after the read to handle the case where the buffer is filled completely. These defensive buffer handling practices prevent potential buffer overruns and align with the ongoing buffer safety improvements across the rtla codebase. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-17-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/utils.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 748b86e6c2cc..5273745bc8df 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -317,6 +317,7 @@ static int procfs_is_workload_pid(const char *comm_prefix, struct dirent *proc_e if (retval <= 0) return 0; + buffer[MAX_PATH-1] = '\0'; retval = strncmp(comm_prefix, buffer, strlen(comm_prefix)); if (retval) return 0; @@ -750,6 +751,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) if (fd < 0) return 0; + memset(path, 0, sizeof(path)); retval = read(fd, path, MAX_PATH); close(fd); @@ -757,6 +759,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) if (retval <= 0) return 0; + path[MAX_PATH-1] = '\0'; start = path; start = strstr(start, ":"); -- cgit v1.2.3 From fb8b8183208d8efe824e8d2c73fb1ab5ad1191fd Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Tue, 6 Jan 2026 08:49:53 -0300 Subject: rtla: Fix parse_cpu_set() return value documentation Correct the return value documentation for parse_cpu_set() function in utils.c. The comment incorrectly stated that the function returns 1 on success and 0 on failure, but the actual implementation returns 0 on success and 1 on failure, following the common error-on-nonzero convention used throughout the codebase. This documentation fix ensures that developers reading the code understand the correct return value semantics and prevents potential misuse of the function's return value in conditional checks. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20260106133655.249887-18-wander@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 5273745bc8df..18986a5aed3c 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -113,7 +113,7 @@ void get_duration(time_t start_time, char *output, int output_size) * Receives a cpu list, like 1-3,5 (cpus 1, 2, 3, 5), and then set * filling cpu_set_t argument. * - * Returns 1 on success, 0 otherwise. + * Returns 0 on success, 1 otherwise. */ int parse_cpu_set(char *cpu_list, cpu_set_t *set) { -- cgit v1.2.3 From 1cabad3a00ab2e3d6bf19c5ab8fc9212d0b81e18 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Jan 2026 09:59:33 +0800 Subject: kunit: tool: test: Rename test_data_path() to _test_data_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running the KUnit testsuite through pytest fails, as the function test_data_path() is recognized as a test function. Its execution fails as pytest tries to resolve the 'path' argument as a fixture which does not exist. Rename the function, so the helper function is not incorrectly recognized as a test function. Link: https://lore.kernel.org/r/20260107015936.2316047-1-davidgow@google.com Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 56 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index bdc51b5c7b10..30ac1cb6c8ed 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -36,7 +36,7 @@ def setUpModule(): def tearDownModule(): shutil.rmtree(test_tmpdir) -def test_data_path(path): +def _test_data_path(path): return os.path.join(abs_test_data_dir, path) class KconfigTest(unittest.TestCase): @@ -52,7 +52,7 @@ class KconfigTest(unittest.TestCase): self.assertFalse(kconfig1.is_subset_of(kconfig0)) def test_read_from_file(self): - kconfig_path = test_data_path('test_read_from_file.kconfig') + kconfig_path = _test_data_path('test_read_from_file.kconfig') kconfig = kunit_config.parse_file(kconfig_path) @@ -98,7 +98,7 @@ class KUnitParserTest(unittest.TestCase): raise AssertionError(f'"{needle}" not found in {list(backup)}!') def test_output_isolated_correctly(self): - log_path = test_data_path('test_output_isolated_correctly.log') + log_path = _test_data_path('test_output_isolated_correctly.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -109,7 +109,7 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 1 - example', result) def test_output_with_prefix_isolated_correctly(self): - log_path = test_data_path('test_pound_sign.log') + log_path = _test_data_path('test_pound_sign.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -138,35 +138,35 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 3 - string-stream-test', result) def test_parse_successful_test_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_successful_nested_tests_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed_nested.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed_nested.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_kselftest_nested(self): - kselftest_log = test_data_path('test_is_test_passed-kselftest.log') + kselftest_log = _test_data_path('test_is_test_passed-kselftest.log') with open(kselftest_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_failed_test_log(self): - failed_log = test_data_path('test_is_test_passed-failure.log') + failed_log = _test_data_path('test_is_test_passed-failure.log') with open(failed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_failed_nested_tests_log(self): - nested_log = test_data_path('test_is_test_passed-failure-nested.log') + nested_log = _test_data_path('test_is_test_passed-failure-nested.log') with open(nested_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) @@ -177,7 +177,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) def test_no_header(self): - empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log') + empty_log = _test_data_path('test_is_test_passed-no_tests_run_no_header.log') with open(empty_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -186,7 +186,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_missing_test_plan(self): - missing_plan_log = test_data_path('test_is_test_passed-' + missing_plan_log = _test_data_path('test_is_test_passed-' 'missing_plan.log') with open(missing_plan_log) as file: result = kunit_parser.parse_run_tests( @@ -197,7 +197,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) def test_no_tests(self): - header_log = test_data_path('test_is_test_passed-no_tests_run_with_header.log') + header_log = _test_data_path('test_is_test_passed-no_tests_run_with_header.log') with open(header_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -206,7 +206,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_no_tests_no_plan(self): - no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log') + no_plan_log = _test_data_path('test_is_test_passed-no_tests_no_plan.log') with open(no_plan_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -218,7 +218,7 @@ class KUnitParserTest(unittest.TestCase): def test_no_kunit_output(self): - crash_log = test_data_path('test_insufficient_memory.log') + crash_log = _test_data_path('test_insufficient_memory.log') print_mock = mock.patch('kunit_printer.Printer.print').start() with open(crash_log) as file: result = kunit_parser.parse_run_tests( @@ -229,7 +229,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_skipped_test(self): - skipped_log = test_data_path('test_skip_tests.log') + skipped_log = _test_data_path('test_skip_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -238,7 +238,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(passed=4, skipped=1)) def test_skipped_all_tests(self): - skipped_log = test_data_path('test_skip_all_tests.log') + skipped_log = _test_data_path('test_skip_all_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -246,7 +246,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(skipped=5)) def test_ignores_hyphen(self): - hyphen_log = test_data_path('test_strip_hyphen.log') + hyphen_log = _test_data_path('test_strip_hyphen.log') with open(hyphen_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -260,7 +260,7 @@ class KUnitParserTest(unittest.TestCase): result.subtests[1].name) def test_ignores_prefix_printk_time(self): - prefix_log = test_data_path('test_config_printk_time.log') + prefix_log = _test_data_path('test_config_printk_time.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -268,7 +268,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_ignores_multiple_prefixes(self): - prefix_log = test_data_path('test_multiple_prefixes.log') + prefix_log = _test_data_path('test_multiple_prefixes.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -276,7 +276,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_mixed_kernel_output(self): - mixed_prefix_log = test_data_path('test_interrupted_tap_output.log') + mixed_prefix_log = _test_data_path('test_interrupted_tap_output.log') with open(mixed_prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -284,7 +284,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_poundsign(self): - pound_log = test_data_path('test_pound_sign.log') + pound_log = _test_data_path('test_pound_sign.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -292,7 +292,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_kernel_panic_end(self): - panic_log = test_data_path('test_kernel_panic_interrupt.log') + panic_log = _test_data_path('test_kernel_panic_interrupt.log') with open(panic_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status) @@ -300,7 +300,7 @@ class KUnitParserTest(unittest.TestCase): self.assertGreaterEqual(result.counts.errors, 1) def test_pound_no_prefix(self): - pound_log = test_data_path('test_pound_no_prefix.log') + pound_log = _test_data_path('test_pound_no_prefix.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -329,7 +329,7 @@ class KUnitParserTest(unittest.TestCase): 'Failures: all_failed_suite, some_failed_suite.test2') def test_ktap_format(self): - ktap_log = test_data_path('test_parse_ktap_output.log') + ktap_log = _test_data_path('test_parse_ktap_output.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(result.counts, kunit_parser.TestCounts(passed=3)) @@ -338,13 +338,13 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual('case_2', result.subtests[0].subtests[1].name) def test_parse_subtest_header(self): - ktap_log = test_data_path('test_parse_subtest_header.log') + ktap_log = _test_data_path('test_parse_subtest_header.log') with open(ktap_log) as file: kunit_parser.parse_run_tests(file.readlines(), stdout) self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) def test_parse_attributes(self): - ktap_log = test_data_path('test_parse_attributes.log') + ktap_log = _test_data_path('test_parse_attributes.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -566,7 +566,7 @@ class KUnitJsonTest(unittest.TestCase): self.addCleanup(mock.patch.stopall) def _json_for(self, log_file): - with open(test_data_path(log_file)) as file: + with open(_test_data_path(log_file)) as file: test_result = kunit_parser.parse_run_tests(file, stdout) json_obj = kunit_json.get_json_result( test=test_result, @@ -607,7 +607,7 @@ class StrContains(str): class KUnitMainTest(unittest.TestCase): def setUp(self): - path = test_data_path('test_is_test_passed-all_passed.log') + path = _test_data_path('test_is_test_passed-all_passed.log') with open(path) as file: all_passed_log = file.readlines() -- cgit v1.2.3 From f126d688193b4dd6d0044c19771469724c03f8f8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Jan 2026 09:59:34 +0800 Subject: kunit: tool: test: Don't rely on implicit working directory change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no kunitconfig_paths are passed to LinuxSourceTree() it falls back to DEFAULT_KUNITCONFIG_PATH. This resolution only works when the current working directory is the root of the source tree. This works by chance when running the full testsuite through the default unittest runner, as some tests will change the current working directory as a side-effect of 'kunit.main()'. When running a single testcase or using pytest, which resets the working directory for each test, this assumption breaks. Explicitly specify an empty kunitconfig for the affected tests. Link: https://lore.kernel.org/r/20260107015936.2316047-2-davidgow@google.com Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 30ac1cb6c8ed..238a31a5cc29 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -477,7 +477,8 @@ class LinuxSourceTreeTest(unittest.TestCase): want_kconfig = kunit_config.Kconfig() want_kconfig.add_entry('NOT_REAL', 'y') - tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y']) + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[os.devnull], + kconfig_add=['CONFIG_NOT_REAL=y']) self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig) def test_invalid_arch(self): @@ -489,7 +490,7 @@ class LinuxSourceTreeTest(unittest.TestCase): return subprocess.Popen(['echo "hi\nbye"'], shell=True, text=True, stdout=subprocess.PIPE) with tempfile.TemporaryDirectory('') as build_dir: - tree = kunit_kernel.LinuxSourceTree(build_dir) + tree = kunit_kernel.LinuxSourceTree(build_dir, kunitconfig_paths=[os.devnull]) mock.patch.object(tree._ops, 'start', side_effect=fake_start).start() with self.assertRaises(ValueError): -- cgit v1.2.3 From 8190b9ea30fef5b9067825b91fb3ec6d678ee5e3 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 18 Dec 2025 14:25:59 -0800 Subject: thermal: intel: selftests: workload_hint: Support slow workload hints Add option to enable slow workload type hints. User can specify "slow" as the command line argument to enable slow workload type hints. There are two slow workload type hints: "power" and "performance". Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251218222559.4110027-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../intel/workload_hint/workload_hint_test.c | 74 +++++++++++++++------- 1 file changed, 52 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c index ca2bd03154e4..569d44f22835 100644 --- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c +++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c @@ -12,6 +12,7 @@ #define WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/notification_delay_ms" #define WORKLOAD_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_hint_enable" +#define WORKLOAD_SLOW_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_slow_hint_enable" #define WORKLOAD_TYPE_INDEX_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_type_index" static const char * const workload_types[] = { @@ -22,6 +23,9 @@ static const char * const workload_types[] = { NULL }; +static int wlt_slow; +static char *wlt_enable_attr; + #define WORKLOAD_TYPE_MAX_INDEX 3 void workload_hint_exit(int signum) @@ -30,7 +34,7 @@ void workload_hint_exit(int signum) /* Disable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -46,6 +50,26 @@ void workload_hint_exit(int signum) close(fd); } +static void update_delay(char *delay_str) +{ + int fd; + + printf("Setting notification delay in ms to %s\n", delay_str); + + fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); + if (fd < 0) { + perror("Unable to open workload notification delay"); + exit(1); + } + + if (write(fd, delay_str, strlen(delay_str)) < 0) { + perror("Can't set delay"); + exit(1); + } + + close(fd); +} + int main(int argc, char **argv) { struct pollfd ufd; @@ -54,32 +78,26 @@ int main(int argc, char **argv) char delay_str[64]; int delay = 0; - printf("Usage: workload_hint_test [notification delay in milli seconds]\n"); + printf("Usage: workload_hint_test [notification delay in milli seconds][slow]\n"); if (argc > 1) { - ret = sscanf(argv[1], "%d", &delay); - if (ret < 0) { - printf("Invalid delay\n"); - exit(1); - } + int i; - printf("Setting notification delay to %d ms\n", delay); - if (delay < 0) - exit(1); + for (i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "slow")) { + wlt_slow = 1; + continue; + } - sprintf(delay_str, "%s\n", argv[1]); - fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); - if (fd < 0) { - perror("Unable to open workload notification delay"); - exit(1); - } + ret = sscanf(argv[1], "%d", &delay); + if (ret < 0) { + printf("Invalid delay\n"); + exit(1); + } - if (write(fd, delay_str, strlen(delay_str)) < 0) { - perror("Can't set delay"); - exit(1); + sprintf(delay_str, "%s\n", argv[1]); + update_delay(delay_str); } - - close(fd); } if (signal(SIGINT, workload_hint_exit) == SIG_IGN) @@ -89,8 +107,13 @@ int main(int argc, char **argv) if (signal(SIGTERM, workload_hint_exit) == SIG_IGN) signal(SIGTERM, SIG_IGN); + if (wlt_slow) + wlt_enable_attr = WORKLOAD_SLOW_ENABLE_ATTRIBUTE; + else + wlt_enable_attr = WORKLOAD_ENABLE_ATTRIBUTE; + /* Enable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -145,6 +168,13 @@ int main(int argc, char **argv) if (ret < 0) break; + if (wlt_slow) { + if (index & 0x10) + printf("workload type slow:%s\n", "power"); + else + printf("workload type slow:%s\n", "performance"); + } + index &= 0x0f; if (index > WORKLOAD_TYPE_MAX_INDEX) printf("Invalid workload type index\n"); -- cgit v1.2.3 From 0b28194c4c8e3a6c2552bfa6451f71b1879dd61f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Fri, 5 Dec 2025 14:49:37 -0800 Subject: KVM: selftests: Test TPR / CR8 sync and interrupt masking Add a few extra TPR / CR8 tests to x86's xapic_state_test to see if: * TPR is 0 on reset, * TPR, PPR and CR8 are equal inside the guest, * TPR and CR8 read equal by the host after a VMExit * TPR borderline values set by the host correctly mask interrupts in the guest. These hopefully will catch the most obvious cases of improper TPR sync or interrupt masking. Do these tests both in x2APIC and xAPIC modes. The x2APIC mode uses SELF_IPI register to trigger interrupts to give it a bit of exercise too. Signed-off-by: Maciej S. Szmigiero Acked-by: Naveen N Rao (AMD) [sean: put code in separate test] Link: https://patch.msgid.link/20251205224937.428122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/include/x86/apic.h | 3 + tools/testing/selftests/kvm/x86/xapic_tpr_test.c | 276 +++++++++++++++++++++++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/xapic_tpr_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..3789890421bd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -124,6 +124,7 @@ TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test +TEST_GEN_PROGS_x86 += x86/xapic_tpr_test TEST_GEN_PROGS_x86 += x86/xcr0_cpuid_test TEST_GEN_PROGS_x86 += x86/xss_msr_test TEST_GEN_PROGS_x86 += x86/debug_regs diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..e9b9aebaac97 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -28,6 +28,8 @@ #define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) #define APIC_TASKPRI 0x80 #define APIC_PROCPRI 0xA0 +#define GET_APIC_PRI(x) (((x) & GENMASK(7, 4)) >> 4) +#define SET_APIC_PRI(x, y) (((x) & ~GENMASK(7, 4)) | (y << 4)) #define APIC_EOI 0xB0 #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) @@ -67,6 +69,7 @@ #define APIC_TMICT 0x380 #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 void apic_disable(void); void xapic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/xapic_tpr_test.c b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c new file mode 100644 index 000000000000..3862134d9d40 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +static bool is_x2apic; + +#define IRQ_VECTOR 0x20 + +/* See also the comment at similar assertion in memslot_perf_test.c */ +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static atomic_uint tpr_guest_irq_sync_val; + +static void tpr_guest_irq_sync_flag_reset(void) +{ + atomic_store_explicit(&tpr_guest_irq_sync_val, 0, + memory_order_release); +} + +static unsigned int tpr_guest_irq_sync_val_get(void) +{ + return atomic_load_explicit(&tpr_guest_irq_sync_val, + memory_order_acquire); +} + +static void tpr_guest_irq_sync_val_inc(void) +{ + atomic_fetch_add_explicit(&tpr_guest_irq_sync_val, 1, + memory_order_acq_rel); +} + +static void tpr_guest_irq_handler_xapic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + xapic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_handler_x2apic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + x2apic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_queue(void) +{ + if (is_x2apic) { + x2apic_write_reg(APIC_SELF_IPI, IRQ_VECTOR); + } else { + uint32_t icr, icr2; + + icr = APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_FIXED | + IRQ_VECTOR; + icr2 = 0; + + xapic_write_reg(APIC_ICR2, icr2); + xapic_write_reg(APIC_ICR, icr); + } +} + +static uint8_t tpr_guest_tpr_get(void) +{ + uint32_t taskpri; + + if (is_x2apic) + taskpri = x2apic_read_reg(APIC_TASKPRI); + else + taskpri = xapic_read_reg(APIC_TASKPRI); + + return GET_APIC_PRI(taskpri); +} + +static uint8_t tpr_guest_ppr_get(void) +{ + uint32_t procpri; + + if (is_x2apic) + procpri = x2apic_read_reg(APIC_PROCPRI); + else + procpri = xapic_read_reg(APIC_PROCPRI); + + return GET_APIC_PRI(procpri); +} + +static uint8_t tpr_guest_cr8_get(void) +{ + uint64_t cr8; + + asm volatile ("mov %%cr8, %[cr8]\n\t" : [cr8] "=r"(cr8)); + + return cr8 & GENMASK(3, 0); +} + +static void tpr_guest_check_tpr_ppr_cr8_equal(void) +{ + uint8_t tpr; + + tpr = tpr_guest_tpr_get(); + + GUEST_ASSERT_EQ(tpr_guest_ppr_get(), tpr); + GUEST_ASSERT_EQ(tpr_guest_cr8_get(), tpr); +} + +static void tpr_guest_code(void) +{ + cli(); + + if (is_x2apic) + x2apic_enable(); + else + xapic_enable(); + + GUEST_ASSERT_EQ(tpr_guest_tpr_get(), 0); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* TPR = 0 but IRQ masked by IF=0, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 0); + + sti(); + + /* IF=1 now, IRQ should fire */ + while (tpr_guest_irq_sync_val_get() == 0) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(true); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* IRQ masked by barely high enough TPR now, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(false); + tpr_guest_check_tpr_ppr_cr8_equal(); + + /* TPR barely low enough now to unmask IRQ, should fire */ + while (tpr_guest_irq_sync_val_get() == 1) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 2); + + GUEST_DONE(); +} + +static uint8_t lapic_tpr_get(struct kvm_lapic_state *xapic) +{ + return GET_APIC_PRI(*((u32 *)&xapic->regs[APIC_TASKPRI])); +} + +static void lapic_tpr_set(struct kvm_lapic_state *xapic, uint8_t val) +{ + u32 *taskpri = (u32 *)&xapic->regs[APIC_TASKPRI]; + + *taskpri = SET_APIC_PRI(*taskpri, val); +} + +static uint8_t sregs_tpr(struct kvm_sregs *sregs) +{ + return sregs->cr8 & GENMASK(3, 0); +} + +static void test_tpr_check_tpr_zero(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic_state xapic; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(lapic_tpr_get(&xapic), 0); +} + +static void test_tpr_check_tpr_cr8_equal(struct kvm_vcpu *vcpu) +{ + struct kvm_sregs sregs; + struct kvm_lapic_state xapic; + + vcpu_sregs_get(vcpu, &sregs); + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(sregs_tpr(&sregs), lapic_tpr_get(&xapic)); +} + +static void test_tpr_set_tpr_for_irq(struct kvm_vcpu *vcpu, bool mask) +{ + struct kvm_lapic_state xapic; + uint8_t tpr; + + static_assert(IRQ_VECTOR >= 16, "invalid IRQ vector number"); + tpr = IRQ_VECTOR / 16; + if (!mask) + tpr--; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + lapic_tpr_set(&xapic, tpr); + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); +} + +static void test_tpr(bool __is_x2apic) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool done = false; + + is_x2apic = __is_x2apic; + + vm = vm_create_with_one_vcpu(&vcpu, tpr_guest_code); + if (is_x2apic) { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_x2apic); + } else { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_xapic); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_X2APIC); + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + } + + sync_global_to_guest(vcpu->vm, is_x2apic); + + /* According to the SDM/APM the TPR value at reset is 0 */ + test_tpr_check_tpr_zero(vcpu); + test_tpr_check_tpr_cr8_equal(vcpu); + + tpr_guest_irq_sync_flag_reset(); + sync_global_to_guest(vcpu->vm, tpr_guest_irq_sync_val); + + while (!done) { + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + test_tpr_check_tpr_cr8_equal(vcpu); + done = true; + break; + case UCALL_SYNC: + test_tpr_check_tpr_cr8_equal(vcpu); + test_tpr_set_tpr_for_irq(vcpu, uc.args[1]); + break; + default: + TEST_FAIL("Unknown ucall result 0x%lx", uc.cmd); + break; + } + } + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + /* + * Use separate VMs for the xAPIC and x2APIC tests so that x2APIC can + * be fully hidden from the guest. KVM disallows changing CPUID after + * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. + */ + test_tpr(false); + test_tpr(true); +} -- cgit v1.2.3 From 7fe9f5366bd5d7ee3cfd9f66868a4410d6e4792d Mon Sep 17 00:00:00 2001 From: MJ Pooladkhay Date: Mon, 22 Dec 2025 17:42:07 +0000 Subject: KVM: selftests: Fix sign extension bug in get_desc64_base() The function get_desc64_base() performs a series of bitwise left shifts on fields of various sizes. More specifically, when performing '<< 24' on 'desc->base2' (which is a u8), 'base2' is promoted to a signed integer before shifting. In a scenario where base2 >= 0x80, the shift places a 1 into bit 31, causing the 32-bit intermediate value to become negative. When this result is cast to uint64_t or ORed into the return value, sign extension occurs, corrupting the upper 32 bits of the address (base3). Example: Given: base0 = 0x5000 base1 = 0xd6 base2 = 0xf8 base3 = 0xfffffe7c Expected return: 0xfffffe7cf8d65000 Actual return: 0xfffffffff8d65000 Fix this by explicitly casting the fields to 'uint64_t' before shifting to prevent sign extension. Signed-off-by: MJ Pooladkhay Link: https://patch.msgid.link/20251222174207.107331-1-mj@pooladkhay.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 57d62a425109..26a91bb73c93 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -436,8 +436,10 @@ struct kvm_x86_state { static inline uint64_t get_desc64_base(const struct desc64 *desc) { - return ((uint64_t)desc->base3 << 32) | - (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); + return (uint64_t)desc->base3 << 32 | + (uint64_t)desc->base2 << 24 | + (uint64_t)desc->base1 << 16 | + (uint64_t)desc->base0; } static inline uint64_t rdtsc(void) -- cgit v1.2.3 From 69e81ed5e6a59c12c0c6756c3f0524e2ddb023f4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:30 -0800 Subject: KVM: selftests: Make __vm_get_page_table_entry() static The function is only used in processor.c, drop the declaration in processor.h and make it static. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 -- tools/testing/selftests/kvm/lib/x86/processor.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 26a91bb73c93..1cb5b4c46b99 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1369,8 +1369,6 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level); uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 36104d27f3d9..c14bf2b5f28f 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -306,8 +306,8 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level) +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) { int va_width = 12 + (vm->pgtable_levels) * 9; uint64_t *pte = &vm->pgd; -- cgit v1.2.3 From 97dfbdfea405a0820ccfcf00afdda4c0f47c3df8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:31 -0800 Subject: KVM: selftests: Stop passing a memslot to nested_map_memslot() On x86, KVM selftests use memslot 0 for all the default regions used by the test infrastructure. This is an implementation detail. nested_map_memslot() is currently used to map the default regions by explicitly passing slot 0, which leaks the library implementation into the caller. Rename the function to a very verbose nested_identity_map_default_memslots() to reflect what it actually does. Add an assertion that only memslot 0 is being used so that the implementation does not change from under us. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 4 ++-- tools/testing/selftests/kvm/lib/x86/vmx.c | 12 ++++++++---- tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 96e2b4c630a9..91916b8aa94b 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -563,8 +563,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot); +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 29b082a58daa..eec33ec63811 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -494,12 +494,16 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot) +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { + uint32_t s, memslot = 0; sparsebit_idx_t i, last; - struct userspace_mem_region *region = - memslot2region(vm, memslot); + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); i = (region->region.guest_phys_addr >> vm->page_shift) - 1; last = i + (region->region.memory_size >> vm->page_shift); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 98cb6bdab3e6..aab7333aaef0 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,7 +121,7 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_map_memslot(vmx, vm, 0); + nested_identity_map_default_memslots(vmx, vm); nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } -- cgit v1.2.3 From 60de423781ad9967bfdd3a2f02ba0a31787b0d2c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:32 -0800 Subject: KVM: selftests: Rename nested TDP mapping functions Rename the functions from nested_* to tdp_* to make their purpose clearer. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 16 +++---- tools/testing/selftests/kvm/lib/x86/memstress.c | 4 +- tools/testing/selftests/kvm/lib/x86/vmx.c | 50 +++++++++++----------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 6 +-- 4 files changed, 37 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 91916b8aa94b..04b8231d032a 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -559,14 +559,14 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr); -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr); +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 0b1f288ad556..1928b00bde51 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -70,11 +70,11 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. */ - nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - nested_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vmx, vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index eec33ec63811..1954ccdfc353 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -362,12 +362,12 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void nested_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) +static void tdp_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) { if (!pte->readable) { pte->writable = true; @@ -394,8 +394,8 @@ static void nested_create_pte(struct kvm_vm *vm, } -void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; @@ -428,7 +428,7 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; pte = &pt[index]; - nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); + tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); if (pte->page_size) break; @@ -445,10 +445,10 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -468,8 +468,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, +void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); @@ -479,23 +479,23 @@ void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) { - __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -512,18 +512,16 @@ void nested_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - nested_map(vmx, vm, - (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, - 1 << vm->page_shift); + tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index aab7333aaef0..e7d0c08ba29d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,9 +121,9 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_identity_map_default_memslots(vmx, vm); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vmx, vm); + tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); -- cgit v1.2.3 From b320c03d685704df51cf0774edd799e96c505c74 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:33 -0800 Subject: KVM: selftests: Kill eptPageTablePointer Replace the struct overlay with explicit bitmasks, which is clearer and less error-prone. See commit f18b4aebe107 ("kvm: selftests: do not use bitfields larger than 32-bits for PTEs") for an example of why bitfields are not preferable. Remove the unused PAGE_SHIFT_4K definition while at it. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 37 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 1954ccdfc353..85043bb1ec4d 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -10,10 +10,16 @@ #include "processor.h" #include "vmx.h" -#define PAGE_SHIFT_4K 12 - #define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 +#define EPTP_MT_SHIFT 0 /* EPTP memtype bits 2:0 */ +#define EPTP_PWL_SHIFT 3 /* EPTP page walk length bits 5:3 */ +#define EPTP_AD_ENABLED_SHIFT 6 /* EPTP AD enabled bit 6 */ + +#define EPTP_WB (X86_MEMTYPE_WB << EPTP_MT_SHIFT) +#define EPTP_PWL_4 (3ULL << EPTP_PWL_SHIFT) /* PWL is (levels - 1) */ +#define EPTP_AD_ENABLED (1ULL << EPTP_AD_ENABLED_SHIFT) + bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; @@ -34,14 +40,6 @@ struct eptPageTableEntry { uint64_t suppress_ve:1; }; -struct eptPageTablePointer { - uint64_t memory_type:3; - uint64_t page_walk_length:3; - uint64_t ad_enabled:1; - uint64_t reserved_11_07:5; - uint64_t address:40; - uint64_t reserved_63_52:12; -}; int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -196,16 +194,15 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); if (vmx->eptp_gpa) { - uint64_t ept_paddr; - struct eptPageTablePointer eptp = { - .memory_type = X86_MEMTYPE_WB, - .page_walk_length = 3, /* + 1 */ - .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), - .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, - }; - - memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); - vmwrite(EPT_POINTER, ept_paddr); + uint64_t eptp = vmx->eptp_gpa | EPTP_WB | EPTP_PWL_4; + + TEST_ASSERT((vmx->eptp_gpa & ~PHYSICAL_PAGE_MASK) == 0, + "Illegal bits set in vmx->eptp_gpa"); + + if (ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS)) + eptp |= EPTP_AD_ENABLED; + + vmwrite(EPT_POINTER, eptp); sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; } -- cgit v1.2.3 From 3cd5002807bebf504cbb6645e73d01204324e54a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:34 -0800 Subject: KVM: selftests: Stop setting A/D bits when creating EPT PTEs Stop setting Accessed/Dirty bits when creating EPT entries for L2 so that the stage-1 and stage-2 (a.k.a. TDP) page table APIs can use common code without bleeding the EPT hack into the common APIs. While commit 094444204570 ("selftests: kvm: add test for dirty logging inside nested guests") is _very_ light on details, the most likely explanation is that vmx_dirty_log_test was attempting to avoid taking an EPT Violation on the first _write_ from L2. static void l2_guest_code(u64 *a, u64 *b) { READ_ONCE(*a); WRITE_ONCE(*a, 1); <=== GUEST_SYNC(true); ... } When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. By setting A/D bits in the test's EPT entries, the above READ+WRITE will fault only on the read, and in theory expose the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML"). If the Dirty bit is NOT set, the test will get a false pass due; though again, in theory. However, the test is flawed (and always was, at least in the versions posted publicly), as KVM (correctly) marks the corresponding L1 GFN as dirty (in the dirty bitmap) when creating the writable SPTE. I.e. without a check on the dirty bitmap after the READ_ONCE(), the check after the first WRITE_ONCE() will get a false pass due to the dirty bitmap/log having been updated by the read fault, not by PML. Furthermore, the subsequent behavior in the test's l2_guest_code() effectively hides the flawed test behavior, as the straight writes to a new L2 GPA fault also trigger the KVM bug, and so the test will still detect the failure due to lack of isolation between the two testcases (Read=>Write vs. Write=>Write). WRITE_ONCE(*b, 1); GUEST_SYNC(true); WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); Punt on fixing vmx_dirty_log_test for the moment as it will be easier to properly fix the test once the TDP code uses the common MMU APIs, at which point it will be trivially easy for the test to retrieve the EPT PTE and set the Dirty bit as needed. Signed-off-by: Yosry Ahmed [sean: rewrite changelog to explain the situation] Link: https://patch.msgid.link/20251230230150.4150236-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 85043bb1ec4d..a3e2eae981da 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -432,14 +432,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, pt = addr_gpa2hva(vm, pte->address * vm->page_size); } - - /* - * For now mark these as accessed and dirty because the only - * testcase we have needs that. Can be reconsidered later. - */ - pte->accessed = true; - pte->dirty = true; - } void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, -- cgit v1.2.3 From 9f073ac25b4c4cf3b3ea13b155035108c54148bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:35 -0800 Subject: KVM: selftests: Add "struct kvm_mmu" to track a given MMU instance Add a "struct kvm_mmu" to track a given MMU instance, e.g. a VM's stage-1 MMU versus a VM's stage-2 MMU, so that x86 can share MMU functionality for both stage-1 and stage-2 MMUs, without creating the potential for subtle bugs, e.g. due to consuming on vm->pgtable_levels when operating a stage-2 MMU. Encapsulate the existing de facto MMU in "struct kvm_vm", e.g instead of burying the MMU details in "struct kvm_vm_arch", to avoid more #ifdefs in ____vm_create(), and in the hopes that other architectures can utilize the formalized MMU structure if/when they too support stage-2 page tables. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 11 +++++-- tools/testing/selftests/kvm/lib/arm64/processor.c | 38 +++++++++++----------- tools/testing/selftests/kvm/lib/kvm_util.c | 28 ++++++++-------- .../selftests/kvm/lib/loongarch/processor.c | 28 ++++++++-------- tools/testing/selftests/kvm/lib/riscv/processor.c | 31 +++++++++--------- tools/testing/selftests/kvm/lib/s390/processor.c | 16 ++++----- tools/testing/selftests/kvm/lib/x86/processor.c | 28 ++++++++-------- .../selftests/kvm/x86/vmx_nested_la57_state_test.c | 2 +- 8 files changed, 94 insertions(+), 88 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..39558c05c0bf 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -88,12 +88,17 @@ enum kvm_mem_region_type { NR_MEM_REGIONS, }; +struct kvm_mmu { + bool pgd_created; + uint64_t pgd; + int pgtable_levels; +}; + struct kvm_vm { int mode; unsigned long type; int kvm_fd; int fd; - unsigned int pgtable_levels; unsigned int page_size; unsigned int page_shift; unsigned int pa_bits; @@ -104,13 +109,13 @@ struct kvm_vm { struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; - bool pgd_created; vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; vm_vaddr_t handlers; uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + struct kvm_mmu mmu; + struct kvm_vm_arch arch; struct kvm_binary_stats stats; diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..c40f59d48311 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -28,7 +28,7 @@ static uint64_t page_align(struct kvm_vm *vm, uint64_t v) static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; return (gva >> shift) & mask; @@ -39,7 +39,7 @@ static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels == 4, + TEST_ASSERT(vm->mmu.pgtable_levels == 4, "Mode %d does not have 4 page table levels", vm->mode); return (gva >> shift) & mask; @@ -50,7 +50,7 @@ static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels >= 3, + TEST_ASSERT(vm->mmu.pgtable_levels >= 3, "Mode %d does not have >= 3 page table levels", vm->mode); return (gva >> shift) & mask; @@ -104,7 +104,7 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) static uint64_t ptrs_per_pgd(struct kvm_vm *vm) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; return 1 << (vm->va_bits - shift); } @@ -117,13 +117,13 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -147,12 +147,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, vaddr) * 8; if (!*ptep) *ptep = addr_pte(vm, vm_alloc_page_table(vm), PGD_TYPE_TABLE | PTE_VALID); - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; if (!*ptep) @@ -190,16 +190,16 @@ uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level { uint64_t *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; if (level == 0) return ptep; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; if (!ptep) @@ -263,13 +263,13 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = 4 - (vm->pgtable_levels - 1); + int level = 4 - (vm->mmu.pgtable_levels - 1); uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + for (pgd = vm->mmu.pgd; pgd < vm->mmu.pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -350,7 +350,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); + ttbr0_el1 = vm->mmu.pgd & GENMASK(47, vm->page_shift); /* Configure output size */ switch (vm->mode) { @@ -358,7 +358,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P52V48_16K: case VM_MODE_P52V48_64K: tcr_el1 |= TCR_IPS_52_BITS; - ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; + ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->mmu.pgd) << 2; break; case VM_MODE_P48V48_4K: case VM_MODE_P48V48_16K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..65752daeed90 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -281,34 +281,34 @@ struct kvm_vm *____vm_create(struct vm_shape shape) /* Setup mode specific traits. */ switch (vm->mode) { case VM_MODE_P52V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P52V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P48V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P48V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P40V48_4K: case VM_MODE_P36V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P40V48_64K: case VM_MODE_P36V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P47V47_16K: case VM_MODE_P36V47_16K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ @@ -321,22 +321,22 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->va_bits); if (vm->va_bits == 57) { - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; } else { TEST_ASSERT(vm->va_bits == 48, "Unexpected guest virtual address width: %d", vm->va_bits); - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; } #else TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; case VM_MODE_P44V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; default: TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); @@ -1956,8 +1956,8 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); sparsebit_dump(stream, vm->vpages_mapped, indent + 2); fprintf(stream, "%*spgd_created: %u\n", indent, "", - vm->pgd_created); - if (vm->pgd_created) { + vm->mmu.pgd_created); + if (vm->mmu.pgd_created) { fprintf(stream, "%*sVirtual Translation Tables:\n", indent + 2, ""); virt_dump(stream, vm, indent + 4); diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 07c103369ddb..17aa55a2047a 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -50,11 +50,11 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) int i; vm_paddr_t child, table; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; child = table = 0; - for (i = 0; i < vm->pgtable_levels; i++) { + for (i = 0; i < vm->mmu.pgtable_levels; i++) { invalid_pgtable[i] = child; table = vm_phy_page_alloc(vm, LOONGARCH_PAGE_TABLE_PHYS_MIN, vm->memslots[MEM_REGION_PT]); @@ -62,8 +62,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_set_pgtable(vm, table, child); child = table; } - vm->pgd = table; - vm->pgd_created = true; + vm->mmu.pgd = table; + vm->mmu.pgd_created = true; } static int virt_pte_none(uint64_t *ptep, int level) @@ -77,11 +77,11 @@ static uint64_t *virt_populate_pte(struct kvm_vm *vm, vm_vaddr_t gva, int alloc) uint64_t *ptep; vm_paddr_t child; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - child = vm->pgd; - level = vm->pgtable_levels - 1; + child = vm->mmu.pgd; + level = vm->mmu.pgtable_levels - 1; while (level > 0) { ptep = addr_gpa2hva(vm, child) + virt_pte_index(vm, gva, level) * 8; if (virt_pte_none(ptep, level)) { @@ -161,11 +161,11 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - level = vm->pgtable_levels - 1; - pte_dump(stream, vm, indent, vm->pgd, level); + level = vm->mmu.pgtable_levels - 1; + pte_dump(stream, vm, indent, vm->mmu.pgd, level); } void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) @@ -297,7 +297,7 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) width = vm->page_shift - 3; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: /* pud page shift and width */ val = (vm->page_shift + width * 2) << 20 | (width << 25); @@ -309,15 +309,15 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) val |= vm->page_shift | width << 5; break; default: - TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->pgtable_levels); + TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->mmu.pgtable_levels); } loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL0, val); /* PGD page shift and width */ - val = (vm->page_shift + width * (vm->pgtable_levels - 1)) | width << 6; + val = (vm->page_shift + width * (vm->mmu.pgtable_levels - 1)) | width << 6; loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL1, val); - loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->pgd); + loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->mmu.pgd); /* * Refill exception runs on real mode diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..e6ec7c224fc3 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -60,7 +60,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { TEST_ASSERT(level > -1, "Negative page table level (%d) not possible", level); - TEST_ASSERT(level < vm->pgtable_levels, + TEST_ASSERT(level < vm->mmu.pgtable_levels, "Invalid page table level (%d)", level); return (gva & pte_index_mask[level]) >> pte_index_shift[level]; @@ -70,19 +70,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t *ptep, next_ppn; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; TEST_ASSERT((vaddr % vm->page_size) == 0, "Virtual address not on page boundary,\n" @@ -98,7 +98,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, vaddr, level) * 8; if (!*ptep) { next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT; *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | @@ -126,12 +126,12 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, gva, level) * 8; if (!ptep) goto unmapped_gva; level--; @@ -176,13 +176,14 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = vm->pgtable_levels - 1; + struct kvm_mmu *mmu = &vm->mmu; + int level = mmu->pgtable_levels - 1; uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!mmu->pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { + for (pgd = mmu->pgd; pgd < mmu->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -211,7 +212,7 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; + satp = (vm->mmu.pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; satp |= SATP_MODE_48; vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp); diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c index 8ceeb17c819a..6a9a660413a7 100644 --- a/tools/testing/selftests/kvm/lib/s390/processor.c +++ b/tools/testing/selftests/kvm/lib/s390/processor.c @@ -17,7 +17,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, @@ -25,8 +25,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) vm->memslots[MEM_REGION_PT]); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); - vm->pgd = paddr; - vm->pgd_created = true; + vm->mmu.pgd = paddr; + vm->mmu.pgd_created = true; } /* @@ -70,7 +70,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) gva, vm->max_gfn, vm->page_size); /* Walk through region and segment tables */ - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) @@ -94,7 +94,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), @@ -149,10 +149,10 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - virt_dump_region(stream, vm, indent, vm->pgd); + virt_dump_region(stream, vm, indent, vm->mmu.pgd); } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -184,7 +184,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ - sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + sregs.crs[1] = vm->mmu.pgd | 0xf; /* Primary region table */ vcpu_sregs_set(vcpu, &sregs); vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index c14bf2b5f28f..f027f86d1535 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -162,9 +162,9 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) "Unknown or unsupported guest mode: 0x%x", vm->mode); /* If needed, create the top-level page table. */ - if (!vm->pgd_created) { - vm->pgd = vm_alloc_page_table(vm); - vm->pgd_created = true; + if (!vm->mmu.pgd_created) { + vm->mmu.pgd = vm_alloc_page_table(vm); + vm->mmu.pgd_created = true; } } @@ -175,7 +175,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -218,7 +218,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->pgd; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,7 +243,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_create_upper_pte(vm, pte, vaddr, paddr, @@ -309,14 +309,14 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - int va_width = 12 + (vm->pgtable_levels) * 9; - uint64_t *pte = &vm->pgd; + int va_width = 12 + (vm->mmu.pgtable_levels) * 9; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,7 +332,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. The virtual address is invalid."); - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_get_pte(vm, pte, vaddr, current_level); @@ -357,7 +357,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; fprintf(stream, "%*s " @@ -365,7 +365,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; if (!(*pml4e & PTE_PRESENT_MASK)) @@ -538,7 +538,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; - if (vm->pgtable_levels == 5) + if (vm->mmu.pgtable_levels == 5) sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); @@ -549,7 +549,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_seg_set_kernel_data_64bit(&sregs.gs); kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - sregs.cr3 = vm->pgd; + sregs.cr3 = vm->mmu.pgd; vcpu_sregs_set(vcpu, &sregs); } diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c index cf1d2d1f2a8f..915c42001dba 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) * L1 needs to read its own PML5 table to set up L2. Identity map * the PML5 table to facilitate this. */ - virt_map(vm, vm->pgd, vm->pgd, 1); + virt_map(vm, vm->mmu.pgd, vm->mmu.pgd, 1); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); -- cgit v1.2.3 From 11825209f5494098da6cab666d8a767650c1c0cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:36 -0800 Subject: KVM: selftests: Plumb "struct kvm_mmu" into x86's MMU APIs In preparation for generalizing the x86 virt mapping APIs to work with TDP (stage-2) page tables, plumb "struct kvm_mmu" into all of the helper functions instead of operating on vm->mmu directly. Opportunistically swap the order of the check in virt_get_pte() to first assert that the parent is the PGD, and then check that the PTE is present, as it makes more sense to check if the parent PTE is the PGD/root (i.e. not a PTE) before checking that the PTE is PRESENT. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed [sean: rebase on common kvm_mmu structure, rewrite changelog] Link: https://patch.msgid.link/20251230230150.4150236-8-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 3 +- tools/testing/selftests/kvm/lib/x86/processor.c | 64 +++++++++++++--------- 2 files changed, 39 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 1cb5b4c46b99..43970a96baed 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1451,7 +1451,8 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f027f86d1535..f25742a804b0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,26 +156,31 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +{ + /* If needed, create the top-level page table. */ + if (!mmu->pgd_created) { + mmu->pgd = vm_alloc_page_table(vm); + mmu->pgd_created = true; + } +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create the top-level page table. */ - if (!vm->mmu.pgd_created) { - vm->mmu.pgd = vm_alloc_page_table(vm); - vm->mmu.pgd_created = true; - } + virt_mmu_init(vm, &vm->mmu); } -static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, - uint64_t vaddr, int level) +static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, + uint64_t *parent_pte, uint64_t vaddr, int level) { uint64_t pt_gpa = PTE_GET_PA(*parent_pte); uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, + TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -183,13 +188,14 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, } static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, uint64_t paddr, int current_level, int target_level) { - uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); + uint64_t *pte = virt_get_pte(vm, mmu, parent_pte, vaddr, current_level); paddr = vm_untag_gpa(vm, paddr); @@ -215,10 +221,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, return pte; } -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->mmu.pgd; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,17 +250,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); if (*pte & PTE_LARGE_MASK) return; } /* Fill in page table entry. */ - pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -270,7 +277,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, PG_LEVEL_4K); } void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -285,7 +292,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, nr_bytes, pg_size); for (i = 0; i < nr_pages; i++) { - __virt_pg_map(vm, vaddr, paddr, level); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, level); sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, nr_bytes / PAGE_SIZE); @@ -294,7 +301,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, + int *level, int current_level) { if (*pte & PTE_LARGE_MASK) { TEST_ASSERT(*level == PG_LEVEL_NONE || @@ -306,17 +314,19 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, + struct kvm_mmu *mmu, + uint64_t vaddr, int *level) { - int va_width = 12 + (vm->mmu.pgtable_levels) * 9; - uint64_t *pte = &vm->mmu.pgd; + int va_width = 12 + (mmu->pgtable_levels) * 9; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= mmu->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,22 +342,22 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. The virtual address is invalid."); - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_get_pte(vm, pte, vaddr, current_level); - if (vm_is_target_pte(pte, level, current_level)) + pte = virt_get_pte(vm, mmu, pte, vaddr, current_level); + if (vm_is_target_pte(mmu, pte, level, current_level)) return pte; } - return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; - return __vm_get_page_table_entry(vm, vaddr, &level); + return __vm_get_page_table_entry(vm, &vm->mmu, vaddr, &level); } void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) @@ -497,7 +507,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int level = PG_LEVEL_NONE; - uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); + uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); TEST_ASSERT(*pte & PTE_PRESENT_MASK, "Leaf PTE not PRESENT for gva: 0x%08lx", gva); -- cgit v1.2.3 From 3d0e7595e81017558189d2252ae9453c57ab3436 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:37 -0800 Subject: KVM: selftests: Add a "struct kvm_mmu_arch arch" member to kvm_mmu Add an arch structure+field in "struct kvm_mmu" so that architectures can track arch-specific information for a given MMU. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/kvm_util.h | 2 ++ tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/s390/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ 6 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h index b973bb2c64a6..4a2033708227 100644 --- a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h @@ -2,6 +2,8 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; + struct kvm_vm_arch { bool has_gic; int gic_fd; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 39558c05c0bf..c1497515fa6a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -92,6 +92,8 @@ struct kvm_mmu { bool pgd_created; uint64_t pgd; int pgtable_levels; + + struct kvm_mmu_arch arch; }; struct kvm_vm { diff --git a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 972bb1c4ab4c..456e5ca170df 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,6 +10,8 @@ extern bool is_forced_emulation_enabled; +struct kvm_mmu_arch {}; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; -- cgit v1.2.3 From 6dd70757213fc046e5f86901e4baf11ed894da6b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:38 -0800 Subject: KVM: selftests: Move PTE bitmasks to kvm_mmu Move the PTE bitmasks into kvm_mmu to parameterize them for virt mapping functions. Introduce helpers to read/write different PTE bits given a kvm_mmu. Drop the 'global' bit definition as it's currently unused, but leave the 'user' bit as it will be used in coming changes. Opportunisitcally rename 'large' to 'huge' as it's more consistent with the kernel naming. Leave PHYSICAL_PAGE_MASK alone, it's fixed in all page table formats and a lot of other macros depend on it. It's tempting to move all the other macros to be per-struct instead, but it would be too much noise for little benefit. Keep c_bit and s_bit in vm->arch as they used before the MMU is initialized, through __vmcreate() -> vm_userspace_mem_region_add() -> vm_mem_add() -> vm_arch_has_protected_memory(). No functional change intended. Signed-off-by: Yosry Ahmed [sean: rename accessors to is__pte()] Link: https://patch.msgid.link/20251230230150.4150236-10-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 16 ++++- .../testing/selftests/kvm/include/x86/processor.h | 28 ++++++--- tools/testing/selftests/kvm/lib/x86/processor.c | 71 +++++++++++++--------- 3 files changed, 76 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 456e5ca170df..bad381d63b6a 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,7 +10,21 @@ extern bool is_forced_emulation_enabled; -struct kvm_mmu_arch {}; +struct pte_masks { + uint64_t present; + uint64_t writable; + uint64_t user; + uint64_t accessed; + uint64_t dirty; + uint64_t huge; + uint64_t nx; + uint64_t c; + uint64_t s; +}; + +struct kvm_mmu_arch { + struct pte_masks pte_masks; +}; struct kvm_vm_arch { vm_vaddr_t gdt; diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 43970a96baed..55dac84cd4a7 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -362,16 +362,6 @@ static inline unsigned int x86_model(unsigned int eax) return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); } -/* Page table bitfield declarations */ -#define PTE_PRESENT_MASK BIT_ULL(0) -#define PTE_WRITABLE_MASK BIT_ULL(1) -#define PTE_USER_MASK BIT_ULL(2) -#define PTE_ACCESSED_MASK BIT_ULL(5) -#define PTE_DIRTY_MASK BIT_ULL(6) -#define PTE_LARGE_MASK BIT_ULL(7) -#define PTE_GLOBAL_MASK BIT_ULL(8) -#define PTE_NX_MASK BIT_ULL(63) - #define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) #define PAGE_SHIFT 12 @@ -1451,6 +1441,24 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) +#define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) +#define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) +#define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) +#define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) +#define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) +#define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) +#define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) +#define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) + +#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +#define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) +#define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) +#define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) +#define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) +#define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f25742a804b0..3800f4ff6770 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,12 +156,14 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } -static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, + struct pte_masks *pte_masks) { /* If needed, create the top-level page table. */ if (!mmu->pgd_created) { mmu->pgd = vm_alloc_page_table(vm); mmu->pgd_created = true; + mmu->arch.pte_masks = *pte_masks; } } @@ -170,7 +172,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - virt_mmu_init(vm, &vm->mmu); + struct pte_masks pte_masks = (struct pte_masks){ + .present = BIT_ULL(0), + .writable = BIT_ULL(1), + .user = BIT_ULL(2), + .accessed = BIT_ULL(5), + .dirty = BIT_ULL(6), + .huge = BIT_ULL(7), + .nx = BIT_ULL(63), + .c = vm->arch.c_bit, + .s = vm->arch.s_bit, + }; + + virt_mmu_init(vm, &vm->mmu, &pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -180,7 +194,7 @@ static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), + TEST_ASSERT((*parent_pte == mmu->pgd) || is_present_pte(mmu, parent_pte), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -199,10 +213,10 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); - if (!(*pte & PTE_PRESENT_MASK)) { - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (!is_present_pte(mmu, pte)) { + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); if (current_level == target_level) - *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; } else { @@ -214,7 +228,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, TEST_ASSERT(current_level != target_level, "Cannot create hugepage at level: %u, vaddr: 0x%lx", current_level, vaddr); - TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + TEST_ASSERT(!is_huge_pte(mmu, pte), "Cannot create page table at level: %u, vaddr: 0x%lx", current_level, vaddr); } @@ -255,24 +269,24 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, current_level--) { pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); - if (*pte & PTE_LARGE_MASK) + if (is_huge_pte(mmu, pte)) return; } /* Fill in page table entry. */ pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); - TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final * leaf PTE needs manually set the C/S-bit. */ if (vm_is_gpa_protected(vm, paddr)) - *pte |= vm->arch.c_bit; + *pte |= PTE_C_BIT_MASK(mmu); else - *pte |= vm->arch.s_bit; + *pte |= PTE_S_BIT_MASK(mmu); } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -304,7 +318,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, int *level, int current_level) { - if (*pte & PTE_LARGE_MASK) { + if (is_huge_pte(mmu, pte)) { TEST_ASSERT(*level == PG_LEVEL_NONE || *level == current_level, "Unexpected hugepage at level %d", current_level); @@ -362,12 +376,13 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + struct kvm_mmu *mmu = &vm->mmu; uint64_t *pml4e, *pml4e_start; uint64_t *pdpe, *pdpe_start; uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->mmu.pgd_created) + if (!mmu->pgd_created) return; fprintf(stream, "%*s " @@ -375,47 +390,47 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, mmu->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!(*pml4e & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pml4e)) continue; fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), - !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); + is_writable_pte(mmu, pml4e), is_nx_pte(mmu, pml4e)); pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!(*pdpe & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pdpe)) continue; fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), - !!(*pdpe & PTE_NX_MASK)); + PTE_GET_PFN(*pdpe), is_writable_pte(mmu, pdpe), + is_nx_pte(mmu, pdpe)); pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!(*pde & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pde)) continue; fprintf(stream, "%*spde 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), - !!(*pde & PTE_NX_MASK)); + PTE_GET_PFN(*pde), is_writable_pte(mmu, pde), + is_nx_pte(mmu, pde)); pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!(*pte & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pte)) continue; fprintf(stream, "%*spte 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u " @@ -424,9 +439,9 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) pte - pte_start, pte, addr_hva2gpa(vm, pte), PTE_GET_PFN(*pte), - !!(*pte & PTE_WRITABLE_MASK), - !!(*pte & PTE_NX_MASK), - !!(*pte & PTE_DIRTY_MASK), + is_writable_pte(mmu, pte), + is_nx_pte(mmu, pte), + is_dirty_pte(mmu, pte), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -509,7 +524,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) int level = PG_LEVEL_NONE; uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); - TEST_ASSERT(*pte & PTE_PRESENT_MASK, + TEST_ASSERT(is_present_pte(&vm->mmu, pte), "Leaf PTE not PRESENT for gva: 0x%08lx", gva); /* -- cgit v1.2.3 From f00f519cebcd4280c4ce4fab8133ed2dcfa8f95a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:39 -0800 Subject: KVM: selftests: Use a TDP MMU to share EPT page tables between vCPUs prepare_eptp() currently allocates new EPTs for each vCPU. memstress has its own hack to share the EPTs between vCPUs. Currently, there is no reason to have separate EPTs for each vCPU, and the complexity is significant. The only reason it doesn't matter now is because memstress is the only user with multiple vCPUs. Add vm_enable_ept() to allocate EPT page tables for an entire VM, and use it everywhere to replace prepare_eptp(). Drop 'eptp' and 'eptp_hva' from 'struct vmx_pages' as they serve no purpose (e.g. the EPTP can be built from the PGD), but keep 'eptp_gpa' so that the MMU structure doesn't need to be passed in along with vmx_pages. Dynamically allocate the TDP MMU structure to avoid a cyclical dependency between kvm_util_arch.h and kvm_util.h. Remove the workaround in memstress to copy the EPT root between vCPUs since that's now the default behavior. Name the MMU tdp_mmu instead of e.g. nested_mmu or nested.mmu to avoid recreating the same mess that KVM has with respect to "nested" MMUs, e.g. does nested refer to the stage-2 page tables created by L1, or the stage-1 page tables created by L2? Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-11-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +++ .../testing/selftests/kvm/include/x86/processor.h | 3 +++ tools/testing/selftests/kvm/include/x86/vmx.h | 8 +++--- tools/testing/selftests/kvm/lib/x86/memstress.c | 19 +++++--------- tools/testing/selftests/kvm/lib/x86/processor.c | 9 +++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 30 ++++++++++++++-------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 7 +++-- 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index bad381d63b6a..05a1fc1780f2 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -26,6 +26,8 @@ struct kvm_mmu_arch { struct pte_masks pte_masks; }; +struct kvm_mmu; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; @@ -35,6 +37,8 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; + + struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 55dac84cd4a7..0164ef090787 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1459,6 +1459,9 @@ enum pg_level { #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) #define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks); + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 04b8231d032a..1fd83c23529a 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -520,13 +520,11 @@ struct vmx_pages { uint64_t vmwrite_gpa; void *vmwrite; - void *eptp_hva; - uint64_t eptp_gpa; - void *eptp; - void *apic_access_hva; uint64_t apic_access_gpa; void *apic_access; + + uint64_t eptp_gpa; }; union vmx_basic { @@ -568,7 +566,7 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); +void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 1928b00bde51..00f7f11e5f0e 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,12 +59,10 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm); - /* * Identity map the first 4G and the test region with 1G pages so that * KVM can shadow the EPT12 with the maximum huge page size supported @@ -79,7 +77,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx, *vmx0 = NULL; + struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,18 +85,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_ept()); + vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vmx = vcpu_alloc_vmx(vm, &vmx_gva); - if (vcpu_id == 0) { - memstress_setup_ept(vmx, vm); - vmx0 = vmx; - } else { - /* Share the same EPT table across all vCPUs. */ - vmx->eptp = vmx0->eptp; - vmx->eptp_hva = vmx0->eptp_hva; - vmx->eptp_gpa = vmx0->eptp_gpa; - } + /* The EPTs are shared across vCPUs, setup the mappings once */ + if (vcpu_id == 0) + memstress_setup_ept_mappings(vmx, vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 3800f4ff6770..8a9298a72897 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -187,6 +187,15 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_mmu_init(vm, &vm->mmu, &pte_masks); } +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks) +{ + TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + + vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); + virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); +} + static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index a3e2eae981da..9d4e391fdf2c 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -56,6 +56,21 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) return evmcs_ver; } +void vm_enable_ept(struct kvm_vm *vm) +{ + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + if (vm->arch.tdp_mmu) + return; + + /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ + struct pte_masks pte_masks = (struct pte_masks) { + + }; + + /* TODO: Add support for 5-level EPT. */ + tdp_mmu_init(vm, 4, &pte_masks); +} + /* Allocate memory regions for nested VMX tests. * * Input Args: @@ -105,6 +120,9 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); + if (vm->arch.tdp_mmu) + vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + *p_vmx_gva = vmx_gva; return vmx; } @@ -395,7 +413,8 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); - struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); + struct eptPageTableEntry *pt = eptp_hva, *pte; uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -525,15 +544,6 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) -{ - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - - vmx->eptp = (void *)vm_vaddr_alloc_page(vm); - vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); - vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); -} - void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) { vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index e7d0c08ba29d..5c8cf8ac42a2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,6 +93,9 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (enable_ept) + vm_enable_ept(vm); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ -113,14 +116,10 @@ static void test_vmx_dirty_log(bool enable_ept) * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to * 0xc0000000. * - * Note that prepare_eptp should be called only L1's GPA map is done, - * meaning after the last call to virt_map. - * * When EPT is disabled, the L2 guest code will still access the same L1 * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm); tdp_identity_map_default_memslots(vmx, vm); tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); -- cgit v1.2.3 From e40e72fec0dea9ac55aea84a0d76ccb7d7f32204 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:40 -0800 Subject: KVM: selftests: Stop passing VMX metadata to TDP mapping functions The root GPA is now retrieved from the nested MMU, stop passing VMX metadata. This is in preparation for making these functions work for NPTs as well. Opportunistically drop tdp_pg_map() since it's unused. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 11 ++------ tools/testing/selftests/kvm/lib/x86/memstress.c | 11 ++++---- tools/testing/selftests/kvm/lib/x86/vmx.c | 33 ++++++++-------------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 9 +++--- 4 files changed, 24 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 1fd83c23529a..4dd4c2094ee6 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,14 +557,9 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr); -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 00f7f11e5f0e..3319cb57a78d 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,7 +59,7 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct kvm_vm *vm) { uint64_t start, end; @@ -68,16 +68,15 @@ static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *v * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. */ - tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - tdp_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,11 +86,11 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vmx = vcpu_alloc_vmx(vm, &vmx_gva); + vcpu_alloc_vmx(vm, &vmx_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) - memstress_setup_ept_mappings(vmx, vm); + memstress_setup_ept_mappings(vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 9d4e391fdf2c..ea1c09f9e8ab 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -409,8 +409,8 @@ static void tdp_create_pte(struct kvm_vm *vm, } -void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); @@ -453,12 +453,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } } -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) -{ - __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -476,9 +470,8 @@ void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - int level) +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -487,23 +480,22 @@ void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) { - __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -520,16 +512,15 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + tdp_map(vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size) +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 5c8cf8ac42a2..370f8d3117c2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -80,7 +80,6 @@ void l1_guest_code(struct vmx_pages *vmx) static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; - struct vmx_pages *vmx; unsigned long *bmap; uint64_t *host_test_mem; @@ -96,7 +95,7 @@ static void test_vmx_dirty_log(bool enable_ept) if (enable_ept) vm_enable_ept(vm); - vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); /* Add an extra memory slot for testing dirty logging */ @@ -120,9 +119,9 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - tdp_identity_map_default_memslots(vmx, vm); - tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vm); + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); -- cgit v1.2.3 From 8296b16c0a2ba018c3235db5325d679e603899d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:41 -0800 Subject: KVM: selftests: Add a stage-2 MMU instance to kvm_vm Add a stage-2 MMU instance so that architectures that support nested virtualization (more specifically, nested stage-2 page tables) can create and track stage-2 page tables for running L2 guests. Plumb the structure into common code to avoid cyclical dependencies, and to provide some line of sight to having common APIs for creating stage-2 mappings. As a bonus, putting the member in common code justifies using stage2_mmu instead of tdp_mmu for x86. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c1497515fa6a..371d55e0366e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -116,7 +116,12 @@ struct kvm_vm { uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + /* + * "mmu" is the guest's stage-1, with a short name because the vast + * majority of tests only care about the stage-1 MMU. + */ struct kvm_mmu mmu; + struct kvm_mmu stage2_mmu; struct kvm_vm_arch arch; -- cgit v1.2.3 From 508d1cc3ca0ac428b1d5d614519bc497868c2e9f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:42 -0800 Subject: KVM: selftests: Reuse virt mapping functions for nested EPTs Rework tdp_map() and friends to use __virt_pg_map() and drop the custom EPT code in __tdp_pg_map() and tdp_create_pte(). The EPT code and __virt_pg_map() are practically identical, the main differences are: - EPT uses the EPT struct overlay instead of the PTE masks. - EPT always assumes 4-level EPTs. To reuse __virt_pg_map(), extend the PTE masks to work with EPT's RWX and X-only capabilities, and provide a tdp_mmu_init() API so that EPT can pass in the EPT PTE masks along with the root page level (which is currently hardcoded to '4'). Don't reuse KVM's insane overloading of the USER bit for EPT_R as there's no reason to multiplex bits in the selftests, e.g. selftests aren't trying to shadow guest PTEs and thus don't care about funnelling protections into a common permissions check. Another benefit of reusing the code is having separate handling for upper-level PTEs vs 4K PTEs, which avoids some quirks like setting the large bit on a 4K PTE in the EPTs. For all intents and purposes, no functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-14-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +- .../testing/selftests/kvm/include/x86/processor.h | 16 ++- tools/testing/selftests/kvm/lib/x86/processor.c | 21 +++- tools/testing/selftests/kvm/lib/x86/vmx.c | 119 ++++----------------- 4 files changed, 52 insertions(+), 108 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 05a1fc1780f2..1cf84b8212c6 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -14,6 +14,8 @@ struct pte_masks { uint64_t present; uint64_t writable; uint64_t user; + uint64_t readable; + uint64_t executable; uint64_t accessed; uint64_t dirty; uint64_t huge; @@ -37,8 +39,6 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; - - struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 0164ef090787..e17cbbe71b8f 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1444,6 +1444,8 @@ enum pg_level { #define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) #define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) #define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_READABLE_MASK(mmu) ((mmu)->arch.pte_masks.readable) +#define PTE_EXECUTABLE_MASK(mmu) ((mmu)->arch.pte_masks.executable) #define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) #define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) #define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) @@ -1451,13 +1453,23 @@ enum pg_level { #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) -#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +/* + * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present + * if it's executable or readable, as EPT supports execute-only PTEs, but not + * write-only PTEs. + */ +#define is_present_pte(mmu, pte) \ + (PTE_PRESENT_MASK(mmu) ? \ + !!(*(pte) & PTE_PRESENT_MASK(mmu)) : \ + !!(*(pte) & (PTE_READABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu)))) +#define is_executable_pte(mmu, pte) \ + ((*(pte) & (PTE_EXECUTABLE_MASK(mmu) | PTE_NX_MASK(mmu))) == PTE_EXECUTABLE_MASK(mmu)) #define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) #define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) #define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) #define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) -#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!is_executable_pte(mmu, pte)) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 8a9298a72897..41316cac94e0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -165,6 +165,10 @@ static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, mmu->pgd_created = true; mmu->arch.pte_masks = *pte_masks; } + + TEST_ASSERT(mmu->pgtable_levels == 4 || mmu->pgtable_levels == 5, + "Selftests MMU only supports 4-level and 5-level paging, not %u-level paging", + mmu->pgtable_levels); } void virt_arch_pgd_alloc(struct kvm_vm *vm) @@ -180,6 +184,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) .dirty = BIT_ULL(6), .huge = BIT_ULL(7), .nx = BIT_ULL(63), + .executable = 0, .c = vm->arch.c_bit, .s = vm->arch.s_bit, }; @@ -190,10 +195,10 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks) { - TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + TEST_ASSERT(!vm->stage2_mmu.pgtable_levels, "TDP MMU already initialized"); - vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); - virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); + vm->stage2_mmu.pgtable_levels = pgtable_levels; + virt_mmu_init(vm, &vm->stage2_mmu, pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -223,7 +228,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); if (!is_present_pte(mmu, pte)) { - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -269,6 +275,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, "Unexpected bits in paddr: %lx", paddr); + TEST_ASSERT(!PTE_EXECUTABLE_MASK(mmu) || !PTE_NX_MASK(mmu), + "X and NX bit masks cannot be used simultaneously"); + /* * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. @@ -286,7 +295,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index ea1c09f9e8ab..e3737b3d9120 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -25,21 +25,6 @@ bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; struct hv_vp_assist_page *current_vp_assist; -struct eptPageTableEntry { - uint64_t readable:1; - uint64_t writable:1; - uint64_t executable:1; - uint64_t memory_type:3; - uint64_t ignore_pat:1; - uint64_t page_size:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t ignored_11_10:2; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t suppress_ve:1; -}; - int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -58,13 +43,24 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) void vm_enable_ept(struct kvm_vm *vm) { - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - if (vm->arch.tdp_mmu) - return; + struct pte_masks pte_masks; - /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ - struct pte_masks pte_masks = (struct pte_masks) { + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + /* + * EPTs do not have 'present' or 'user' bits, instead bit 0 is the + * 'readable' bit. + */ + pte_masks = (struct pte_masks) { + .present = 0, + .user = 0, + .readable = BIT_ULL(0), + .writable = BIT_ULL(1), + .executable = BIT_ULL(2), + .huge = BIT_ULL(7), + .accessed = BIT_ULL(8), + .dirty = BIT_ULL(9), + .nx = 0, }; /* TODO: Add support for 5-level EPT. */ @@ -120,8 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); - if (vm->arch.tdp_mmu) - vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + if (vm->stage2_mmu.pgd_created) + vmx->eptp_gpa = vm->stage2_mmu.pgd; *p_vmx_gva = vmx_gva; return vmx; @@ -377,82 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void tdp_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) -{ - if (!pte->readable) { - pte->writable = true; - pte->readable = true; - pte->executable = true; - pte->page_size = (current_level == target_level); - if (pte->page_size) - pte->address = paddr >> vm->page_shift; - else - pte->address = vm_alloc_page_table(vm) >> vm->page_shift; - } else { - /* - * Entry already present. Assert that the caller doesn't want - * a hugepage at this level, and that there isn't a hugepage at - * this level. - */ - TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - TEST_ASSERT(!pte->page_size, - "Cannot create page table at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - } -} - - -void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - int target_level) -{ - const uint64_t page_size = PG_LEVEL_SIZE(target_level); - void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); - struct eptPageTableEntry *pt = eptp_hva, *pte; - uint16_t index; - - TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, - "Unknown or unsupported guest mode: 0x%x", vm->mode); - - TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", - nested_paddr); - TEST_ASSERT((nested_paddr % page_size) == 0, - "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx page_size: 0x%lx", - nested_paddr, page_size); - TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx page_size: 0x%lx", - paddr, page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { - index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - pte = &pt[index]; - - tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - - if (pte->page_size) - break; - - pt = addr_gpa2hva(vm, pte->address * vm->page_size); - } -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -473,6 +393,7 @@ void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { + struct kvm_mmu *mmu = &vm->stage2_mmu; size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -480,7 +401,7 @@ void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vm, nested_paddr, paddr, level); + __virt_pg_map(vm, mmu, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } -- cgit v1.2.3 From 07676c04bd753f32a9fe2f247b0ba2d213dd7a99 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:43 -0800 Subject: KVM: selftests: Move TDP mapping functions outside of vmx.c Now that the functions are no longer VMX-specific, move them to processor.c. Do a minor comment tweak replacing 'EPT' with 'TDP'. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-15-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 4 ++ tools/testing/selftests/kvm/include/x86/vmx.h | 3 - tools/testing/selftests/kvm/lib/x86/processor.c | 53 ++++++++++++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 71 ---------------------- 4 files changed, 57 insertions(+), 74 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index e17cbbe71b8f..461cf155b96e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,10 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 4dd4c2094ee6..92b918700d24 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,9 +557,6 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct kvm_vm *vm); -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 41316cac94e0..29e7d172f945 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -472,6 +472,59 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) +{ + size_t page_size = PG_LEVEL_SIZE(level); + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + __virt_pg_map(vm, &vm->stage2_mmu, nested_paddr, paddr, level); + nested_paddr += page_size; + paddr += page_size; + } +} + +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) +{ + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. + */ +void tdp_identity_map_default_memslots(struct kvm_vm *vm) +{ + uint32_t s, memslot = 0; + sparsebit_idx_t i, last; + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + tdp_map(vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); + } +} + +/* Identity map a region with 1GiB Pages. */ +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) +{ + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); +} + /* * Set Unusable Segment * diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index e3737b3d9120..448a63457467 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -373,77 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -/* - * Map a range of EPT guest physical addresses to the VM's physical address - * - * Input Args: - * vm - Virtual Machine - * nested_paddr - Nested guest physical address to map - * paddr - VM Physical Address - * size - The size of the range to map - * level - The level at which to map the range - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a nested guest translation for the - * page range starting at nested_paddr to the page range starting at paddr. - */ -void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size, int level) -{ - struct kvm_mmu *mmu = &vm->stage2_mmu; - size_t page_size = PG_LEVEL_SIZE(level); - size_t npages = size / page_size; - - TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); - TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); - - while (npages--) { - __virt_pg_map(vm, mmu, nested_paddr, paddr, level); - nested_paddr += page_size; - paddr += page_size; - } -} - -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size) -{ - __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); -} - -/* Prepare an identity extended page table that maps all the - * physical pages in VM. - */ -void tdp_identity_map_default_memslots(struct kvm_vm *vm) -{ - uint32_t s, memslot = 0; - sparsebit_idx_t i, last; - struct userspace_mem_region *region = memslot2region(vm, memslot); - - /* Only memslot 0 is mapped here, ensure it's the only one being used */ - for (s = 0; s < NR_MEM_REGIONS; s++) - TEST_ASSERT_EQ(vm->memslots[s], 0); - - i = (region->region.guest_phys_addr >> vm->page_shift) - 1; - last = i + (region->region.memory_size >> vm->page_shift); - for (;;) { - i = sparsebit_next_clear(region->unused_phy_pages, i); - if (i > last) - break; - - tdp_map(vm, (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, 1 << vm->page_shift); - } -} - -/* Identity map a region with 1GiB Pages. */ -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) -{ - __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); -} - bool kvm_cpu_has_ept(void) { uint64_t ctrl; -- cgit v1.2.3 From 9cb1944f6bf09ecebcc7609f35178b85aa26f165 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:44 -0800 Subject: KVM: selftests: Allow kvm_cpu_has_ept() to be called on AMD CPUs In preparation for generalizing the nested dirty logging test, checking if either EPT or NPT is enabled will be needed. To avoid needing to gate the kvm_cpu_has_ept() call by the CPU type, make sure the function returns false if VMX is not available instead of trying to read VMX-only MSRs. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-16-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 448a63457467..c87b340362a9 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -377,6 +377,9 @@ bool kvm_cpu_has_ept(void) { uint64_t ctrl; + if (!kvm_cpu_has(X86_FEATURE_VMX)) + return false; + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) return false; -- cgit v1.2.3 From 753c0d5a507b939d6efc60c7b437d5330880cce3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:45 -0800 Subject: KVM: selftests: Add support for nested NPTs Implement nCR3 and NPT initialization functions, similar to the EPT equivalents, and create common TDP helpers for enablement checking and initialization. Enable NPT for nested guests by default if the TDP MMU was initialized, similar to VMX. Reuse the PTE masks from the main MMU in the NPT MMU, except for the C and S bits related to confidential VMs. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-17-seanjc@google.com [sean: apply Yosry's fixup for ncr3_gpa] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 2 ++ tools/testing/selftests/kvm/include/x86/svm_util.h | 9 ++++++++ tools/testing/selftests/kvm/lib/x86/memstress.c | 4 ++-- tools/testing/selftests/kvm/lib/x86/processor.c | 15 ++++++++++++++ tools/testing/selftests/kvm/lib/x86/svm.c | 24 ++++++++++++++++++++++ .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 4 ++-- 6 files changed, 54 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 461cf155b96e..115bec5eb1eb 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,8 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void vm_enable_tdp(struct kvm_vm *vm); +bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h index b74c6dcddcbd..5d7c42534bc4 100644 --- a/tools/testing/selftests/kvm/include/x86/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86/svm_util.h @@ -27,6 +27,9 @@ struct svm_test_data { void *msr; /* gva */ void *msr_hva; uint64_t msr_gpa; + + /* NPT */ + uint64_t ncr3_gpa; }; static inline void vmmcall(void) @@ -57,6 +60,12 @@ struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); +static inline bool kvm_cpu_has_npt(void) +{ + return kvm_cpu_has(X86_FEATURE_NPT); +} +void vm_enable_npt(struct kvm_vm *vm); + int open_sev_dev_path_or_exit(void); #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 3319cb57a78d..407abfc34909 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -82,9 +82,9 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc int vcpu_id; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + TEST_REQUIRE(kvm_cpu_has_tdp()); - vm_enable_ept(vm); + vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vcpu_alloc_vmx(vm, &vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 29e7d172f945..a3a4c9a4cbcb 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -8,7 +8,9 @@ #include "kvm_util.h" #include "pmu.h" #include "processor.h" +#include "svm_util.h" #include "sev.h" +#include "vmx.h" #ifndef NUM_INTERRUPTS #define NUM_INTERRUPTS 256 @@ -472,6 +474,19 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void vm_enable_tdp(struct kvm_vm *vm) +{ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vm_enable_ept(vm); + else + vm_enable_npt(vm); +} + +bool kvm_cpu_has_tdp(void) +{ + return kvm_cpu_has_ept() || kvm_cpu_has_npt(); +} + void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index d239c2097391..a25a3471f5f6 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -46,6 +46,9 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); memset(svm->msr_hva, 0, getpagesize()); + if (vm->stage2_mmu.pgd_created) + svm->ncr3_gpa = vm->stage2_mmu.pgd; + *p_svm_gva = svm_gva; return svm; } @@ -59,6 +62,22 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } +void vm_enable_npt(struct kvm_vm *vm) +{ + struct pte_masks pte_masks; + + TEST_ASSERT(kvm_cpu_has_npt(), "KVM doesn't supported nested NPT"); + + /* + * NPTs use the same PTE format, but deliberately drop the C-bit as the + * per-VM shared vs. private information is only meant for stage-1. + */ + pte_masks = vm->mmu.arch.pte_masks; + pte_masks.c = 0; + + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); +} + void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -102,6 +121,11 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; guest_regs.rdi = (u64)svm; + + if (svm->ncr3_gpa) { + ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; + ctrl->nested_cr3 = svm->ncr3_gpa; + } } /* diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 370f8d3117c2..032ab8bf60a4 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,7 +93,7 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); if (enable_ept) - vm_enable_ept(vm); + vm_enable_tdp(vm); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) test_vmx_dirty_log(/*enable_ept=*/false); - if (kvm_cpu_has_ept()) + if (kvm_cpu_has_tdp()) test_vmx_dirty_log(/*enable_ept=*/true); return 0; -- cgit v1.2.3 From 251e4849a79b258fd3e889ff095ba083ce301c13 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:46 -0800 Subject: KVM: selftests: Set the user bit on nested NPT PTEs According to the APM, NPT walks are treated as user accesses. In preparation for supporting NPT mappings, set the 'user' bit on NPTs by adding a mask of bits to always be set on PTEs in kvm_mmu. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-18-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/lib/x86/processor.c | 5 +++-- tools/testing/selftests/kvm/lib/x86/svm.c | 3 +++ 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 1cf84b8212c6..be35d26bb320 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -22,6 +22,8 @@ struct pte_masks { uint64_t nx; uint64_t c; uint64_t s; + + uint64_t always_set; }; struct kvm_mmu_arch { diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 115bec5eb1eb..995277cae94e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1452,6 +1452,7 @@ enum pg_level { #define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) +#define PTE_ALWAYS_SET_MASK(mmu) ((mmu)->arch.pte_masks.always_set) /* * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index a3a4c9a4cbcb..5a3385d48902 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -231,7 +231,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, if (!is_present_pte(mmu, pte)) { *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | - PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + PTE_ALWAYS_SET_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -299,7 +300,7 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | - (paddr & PHYSICAL_PAGE_MASK); + PTE_ALWAYS_SET_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index a25a3471f5f6..2e5c480c9afd 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -75,6 +75,9 @@ void vm_enable_npt(struct kvm_vm *vm) pte_masks = vm->mmu.arch.pte_masks; pte_masks.c = 0; + /* NPT walks are treated as user accesses, so set the 'user' bit. */ + pte_masks.always_set = pte_masks.user; + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); } -- cgit v1.2.3 From 6794d916f87e0a6cd51a3d8c2c0e6ffd48fa7a79 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:47 -0800 Subject: KVM: selftests: Extend vmx_dirty_log_test to cover SVM Generalize the code in vmx_dirty_log_test.c by adding SVM-specific L1 code, doing some renaming (e.g. EPT -> TDP), and having setup code for both SVM and VMX in test_dirty_log(). Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-19-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- .../selftests/kvm/x86/nested_dirty_log_test.c | 210 +++++++++++++++++++++ .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 177 ----------------- 3 files changed, 211 insertions(+), 178 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_dirty_log_test.c delete mode 100644 tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3789890421bd..ffbf891b31d3 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test +TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test @@ -115,7 +116,6 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c new file mode 100644 index 000000000000..89d2e86a0db9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(u64 *a, u64 *b) +{ + READ_ONCE(*a); + WRITE_ONCE(*a, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +static void l2_guest_code_tdp_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_tdp_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + +void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_tdp_enabled; + else + l2_rip = l2_guest_code_tdp_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + if (svm->ncr3_gpa) + l2_rip = l2_guest_code_tdp_enabled; + else + l2_rip = l2_guest_code_tdp_disabled; + + generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_SYNC(false); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +static void test_dirty_log(bool nested_tdp) +{ + vm_vaddr_t nested_gva = 0; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool done = false; + + pr_info("Nested TDP: %s\n", nested_tdp ? "enabled" : "disabled"); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (nested_tdp) + vm_enable_tdp(vm); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + vcpu_args_set(vcpu, 1, nested_gva); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * When TDP is disabled, the L2 guest code will still access the same L1 + * GPAs as the TDP enabled case. + */ + if (nested_tdp) { + tdp_identity_map_default_memslots(vm); + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + } + + bmap = bitmap_zalloc(TEST_MEM_PAGES); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); + + test_dirty_log(/*nested_tdp=*/false); + + if (kvm_cpu_has_tdp()) + test_dirty_log(/*nested_tdp=*/true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c deleted file mode 100644 index 032ab8bf60a4..000000000000 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM dirty page logging test - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -/* The memory slot index to track dirty pages */ -#define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 - -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 - -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 - -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - /* Exit to L1 and never come back. */ - vmcall(); -} - -static void l2_guest_code_ept_enabled(void) -{ - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); -} - -static void l2_guest_code_ept_disabled(void) -{ - /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); -} - -void l1_guest_code(struct vmx_pages *vmx) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - void *l2_rip; - - GUEST_ASSERT(vmx->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx)); - GUEST_ASSERT(load_vmcs(vmx)); - - if (vmx->eptp_gpa) - l2_rip = l2_guest_code_ept_enabled; - else - l2_rip = l2_guest_code_ept_disabled; - - prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(false); - GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_DONE(); -} - -static void test_vmx_dirty_log(bool enable_ept) -{ - vm_vaddr_t vmx_pages_gva = 0; - unsigned long *bmap; - uint64_t *host_test_mem; - - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - bool done = false; - - pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - if (enable_ept) - vm_enable_tdp(vm); - - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - /* Add an extra memory slot for testing dirty logging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, - TEST_MEM_SLOT_INDEX, - TEST_MEM_PAGES, - KVM_MEM_LOG_DIRTY_PAGES); - - /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This - * affects both L1 and L2. However... - */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); - - /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. - * - * When EPT is disabled, the L2 guest code will still access the same L1 - * GPAs as the EPT enabled case. - */ - if (enable_ept) { - tdp_identity_map_default_memslots(vm); - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); - } - - bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); - - while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); - break; - case UCALL_DONE: - done = true; - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - test_vmx_dirty_log(/*enable_ept=*/false); - - if (kvm_cpu_has_tdp()) - test_vmx_dirty_log(/*enable_ept=*/true); - - return 0; -} -- cgit v1.2.3 From 59eef1a47b8c264d09ee84c909a1da60b4e70bd7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:48 -0800 Subject: KVM: selftests: Extend memstress to run on nested SVM Add L1 SVM code and generalize the setup code to work for both VMX and SVM. This allows running 'dirty_log_perf_test -n' on AMD CPUs. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-20-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/memstress.c | 42 ++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 407abfc34909..86f4c5e4c430 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "svm_util.h" #include "vmx.h" void memstress_l2_guest_code(uint64_t vcpu_id) @@ -29,9 +30,10 @@ __asm__( " ud2;" ); -static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) -{ #define L2_GUEST_STACK_SIZE 64 + +static void l1_vmx_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; unsigned long *rsp; @@ -45,10 +47,34 @@ static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm, uint64_t vcpu_id) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + generic_svm_setup(svm, memstress_l2_guest_entry, rsp); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); GUEST_DONE(); } + +static void memstress_l1_guest_code(void *data, uint64_t vcpu_id) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, vcpu_id); + else + l1_svm_code(data, vcpu_id); +} + uint64_t memstress_nested_pages(int nr_vcpus) { /* @@ -78,15 +104,17 @@ static void memstress_setup_ept_mappings(struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { struct kvm_regs regs; - vm_vaddr_t vmx_gva; + vm_vaddr_t nested_gva; int vcpu_id; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vcpu_alloc_vmx(vm, &vmx_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) @@ -99,6 +127,6 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vcpu_regs_get(vcpus[vcpu_id], ®s); regs.rip = (unsigned long) memstress_l1_guest_code; vcpu_regs_set(vcpus[vcpu_id], ®s); - vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + vcpu_args_set(vcpus[vcpu_id], 2, nested_gva, vcpu_id); } } -- cgit v1.2.3 From e353850499c717f1f984f7c208f49a8618beff2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:49 -0800 Subject: KVM: selftests: Rename vm_get_page_table_entry() to vm_get_pte() Shorten the API to get a PTE as the "PTE" acronym is ubiquitous, and the "page table entry" makes it unnecessarily difficult to quickly understand what callers are doing. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-21-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86/processor.c | 2 +- tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c | 2 +- tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 995277cae94e..8f130e7d7048 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1359,7 +1359,7 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 5a3385d48902..ab869a98bbdc 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,7 +390,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index a3b7ce155981..c542cc4762b1 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -619,7 +619,7 @@ int main(int argc, char *argv[]) */ gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); for (i = 0; i < NTEST_PAGES; i++) { - pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + pte = vm_get_pte(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); diff --git a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c index fabeeaddfb3a..0e8aec568010 100644 --- a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c @@ -47,7 +47,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - uint64_t *pte; uint64_t *hva; uint64_t gpa; int rc; @@ -73,8 +72,7 @@ int main(int argc, char *argv[]) hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); - *pte |= BIT_ULL(MAXPHYADDR); + *vm_get_pte(vm, MEM_REGION_GVA) |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); -- cgit v1.2.3 From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:02 +0000 Subject: KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state() The assert messages do not add much value, so use TEST_ASSERT_EQ(), which also nicely displays the addresses in hex. While at it, also assert the values of state->flags. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c index 67a62a5a8895..b59a8a17084d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c @@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); - TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); - TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); free(state); } -- cgit v1.2.3 From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:03 +0000 Subject: KVM: selftests: Extend vmx_set_nested_state_test to cover SVM Add test cases for the validation checks in svm_set_nested_state(), and allow the test to run with SVM as well as VMX. The SVM test also makes sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if EFER.SVME is cleared, verifying a recently fixed bug where GIF was incorrectly expected to always be set when EFER.SVME is cleared. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- .../selftests/kvm/x86/nested_set_state_test.c | 406 +++++++++++++++++++++ .../selftests/kvm/x86/vmx_set_nested_state_test.c | 306 ---------------- 3 files changed, 407 insertions(+), 307 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_set_state_test.c delete mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..4ddece4ee365 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test @@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test -TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/nested_set_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c new file mode 100644 index 000000000000..0f2102b43629 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_set_state_test.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019, Google LLC. + * + * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#include +#include +#include +#include +#include + +/* + * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value + * changes this should be updated. + */ +#define VMCS12_REVISION 0x11e57ed0 + +bool have_evmcs; + +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) +{ + vcpu_nested_state_set(vcpu, state); +} + +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state, + int expected_errno) +{ + int rv; + + rv = __vcpu_nested_state_set(vcpu, state); + TEST_ASSERT(rv == -1 && errno == expected_errno, + "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", + strerror(expected_errno), expected_errno, rv, strerror(errno), + errno); +} + +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EINVAL); +} + +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EFAULT); +} + +void set_revision_id_for_vmcs12(struct kvm_nested_state *state, + u32 vmcs12_revision) +{ + /* Set revision_id in vmcs12 to vmcs12_revision. */ + memcpy(&state->data, &vmcs12_revision, sizeof(u32)); +} + +void set_default_state(struct kvm_nested_state *state) +{ + memset(state, 0, sizeof(*state)); + state->flags = KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GUEST_MODE; + state->format = 0; + state->size = sizeof(*state); +} + +void set_default_vmx_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + if (have_evmcs) + state->flags = KVM_STATE_NESTED_EVMCS; + state->format = 0; + state->size = size; + state->hdr.vmx.vmxon_pa = 0x1000; + state->hdr.vmx.vmcs12_pa = 0x2000; + state->hdr.vmx.smm.flags = 0; + set_revision_id_for_vmcs12(state, VMCS12_REVISION); +} + +void test_vmx_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCS12. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + /* The format must be set to 0. 0 for VMX, 1 for SVM. */ + set_default_vmx_state(state, state_sz); + state->format = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. + */ + set_default_vmx_state(state, state_sz); + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa + * is set to -1ull, but the flags must be zero. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + test_nested_state_expect_einval(vcpu, state); + + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + /* Enable VMX in the guest CPUID. */ + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); + + /* + * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without + * setting the nested state. When the eVMCS flag is not set, the + * expected return value is '0'. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + test_nested_state(vcpu, state); + + /* + * When eVMCS is supported, the eVMCS flag can only be set if the + * enlightened VMCS capability has been enabled. + */ + if (have_evmcs) { + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu); + test_nested_state(vcpu, state); + } + + /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ + state->hdr.vmx.smm.flags = 1; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa set to a non-page aligned address. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and + * KVM_STATE_NESTED_GUEST_MODE set together. + */ + set_default_vmx_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have any of the SMM flags set besides: + * KVM_STATE_NESTED_SMM_GUEST_MODE + * KVM_STATE_NESTED_SMM_VMXON + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | + KVM_STATE_NESTED_SMM_VMXON); + test_nested_state_expect_einval(vcpu, state); + + /* Outside SMM, SMM flags must be zero. */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + test_nested_state(vcpu, state); + + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + test_nested_state(vcpu, state); + + /* Invalid flags are rejected, even if no VMCS loaded. */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* vmxon_pa cannot be the same address as vmcs_pa. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get + * it again. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); + + free(state); +} + +static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME); +} + +static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME); +} + +void set_default_svm_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + state->format = 1; + state->size = size; + state->hdr.svm.vmcb_pa = 0x3000; +} + +void test_svm_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCB. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* The format must be set to 1. 0 for VMX, 1 for SVM. */ + set_default_svm_state(state, state_sz); + state->format = 0; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only */ + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + /* + * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or + * cleared. + */ + vcpu_efer_disable_svm(vcpu); + + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + + /* Enable SVM in the guest EFER. */ + vcpu_efer_enable_svm(vcpu); + + /* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and VMCB + * only when entering guest mode. + */ + set_default_svm_state(state, state_sz/2); + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get it + * again, except for vmcb_pa, which is always returned as 0 when not in + * guest mode. + */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0); + TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET); + + free(state); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_nested_state state; + struct kvm_vcpu *vcpu; + + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + /* + * First run tests with VMX/SVM disabled to check error handling. + * test_{vmx/svm}_nested_state() will re-enable as needed. + */ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + else + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* Passing a NULL kvm_nested_state causes a EFAULT. */ + test_nested_state_expect_efault(vcpu, NULL); + + /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ + set_default_state(&state); + state.size = 0; + test_nested_state_expect_einval(vcpu, &state); + + /* + * Setting the flags 0xf fails the flags check. The only flags that + * can be used are: + * KVM_STATE_NESTED_GUEST_MODE + * KVM_STATE_NESTED_RUN_PENDING + * KVM_STATE_NESTED_EVMCS + */ + set_default_state(&state); + state.flags = 0xf; + test_nested_state_expect_einval(vcpu, &state); + + /* + * If KVM_STATE_NESTED_RUN_PENDING is set then + * KVM_STATE_NESTED_GUEST_MODE has to be set as well. + */ + set_default_state(&state); + state.flags = KVM_STATE_NESTED_RUN_PENDING; + test_nested_state_expect_einval(vcpu, &state); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + test_vmx_nested_state(vcpu); + else + test_svm_nested_state(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c deleted file mode 100644 index b59a8a17084d..000000000000 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_set_nested_state_test - * - * Copyright (C) 2019, Google LLC. - * - * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include -#include -#include -#include - -/* - * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value - * changes this should be updated. - */ -#define VMCS12_REVISION 0x11e57ed0 - -bool have_evmcs; - -void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) -{ - vcpu_nested_state_set(vcpu, state); -} - -void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state, - int expected_errno) -{ - int rv; - - rv = __vcpu_nested_state_set(vcpu, state); - TEST_ASSERT(rv == -1 && errno == expected_errno, - "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", - strerror(expected_errno), expected_errno, rv, strerror(errno), - errno); -} - -void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EINVAL); -} - -void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EFAULT); -} - -void set_revision_id_for_vmcs12(struct kvm_nested_state *state, - u32 vmcs12_revision) -{ - /* Set revision_id in vmcs12 to vmcs12_revision. */ - memcpy(&state->data, &vmcs12_revision, sizeof(u32)); -} - -void set_default_state(struct kvm_nested_state *state) -{ - memset(state, 0, sizeof(*state)); - state->flags = KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_GUEST_MODE; - state->format = 0; - state->size = sizeof(*state); -} - -void set_default_vmx_state(struct kvm_nested_state *state, int size) -{ - memset(state, 0, size); - if (have_evmcs) - state->flags = KVM_STATE_NESTED_EVMCS; - state->format = 0; - state->size = size; - state->hdr.vmx.vmxon_pa = 0x1000; - state->hdr.vmx.vmcs12_pa = 0x2000; - state->hdr.vmx.smm.flags = 0; - set_revision_id_for_vmcs12(state, VMCS12_REVISION); -} - -void test_vmx_nested_state(struct kvm_vcpu *vcpu) -{ - /* Add a page for VMCS12. */ - const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); - struct kvm_nested_state *state = - (struct kvm_nested_state *)malloc(state_sz); - - /* The format must be set to 0. 0 for VMX, 1 for SVM. */ - set_default_vmx_state(state, state_sz); - state->format = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. - */ - set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa - * is set to -1ull, but the flags must be zero. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vcpu, state); - - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - - state->flags = 0; - test_nested_state(vcpu, state); - - /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* - * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without - * setting the nested state. When the eVMCS flag is not set, the - * expected return value is '0'. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state(vcpu, state); - - /* - * When eVMCS is supported, the eVMCS flag can only be set if the - * enlightened VMCS capability has been enabled. - */ - if (have_evmcs) { - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - vcpu_enable_evmcs(vcpu); - test_nested_state(vcpu, state); - } - - /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ - state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vcpu, state); - - /* Invalid flags are rejected. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa set to a non-page aligned address. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and - * KVM_STATE_NESTED_GUEST_MODE set together. - */ - set_default_vmx_state(state, state_sz); - state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have any of the SMM flags set besides: - * KVM_STATE_NESTED_SMM_GUEST_MODE - * KVM_STATE_NESTED_SMM_VMXON - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | - KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vcpu, state); - - /* Outside SMM, SMM flags must be zero. */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * Size must be large enough to fit kvm_nested_state and vmcs12 - * if VMCS12 physical address is set - */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vcpu, state); - - /* - * KVM_SET_NESTED_STATE succeeds with invalid VMCS - * contents but L2 not running. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - test_nested_state(vcpu, state); - - /* Invalid flags are rejected, even if no VMCS loaded. */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* vmxon_pa cannot be the same address as vmcs_pa. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 0; - state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vcpu, state); - - /* - * Test that if we leave nesting the state reflects that when we get - * it again. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = 0; - test_nested_state(vcpu, state); - vcpu_nested_state_get(vcpu, state); - TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, - "Size must be between %ld and %d. The size returned was %d.", - sizeof(*state), state_sz, state->size); - - TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); - TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); - TEST_ASSERT_EQ(state->flags, 0); - - free(state); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_nested_state state; - struct kvm_vcpu *vcpu; - - have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - - /* - * AMD currently does not implement set_nested_state, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - /* - * First run tests with VMX disabled to check error handling. - */ - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vcpu, NULL); - - /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ - set_default_state(&state); - state.size = 0; - test_nested_state_expect_einval(vcpu, &state); - - /* - * Setting the flags 0xf fails the flags check. The only flags that - * can be used are: - * KVM_STATE_NESTED_GUEST_MODE - * KVM_STATE_NESTED_RUN_PENDING - * KVM_STATE_NESTED_EVMCS - */ - set_default_state(&state); - state.flags = 0xf; - test_nested_state_expect_einval(vcpu, &state); - - /* - * If KVM_STATE_NESTED_RUN_PENDING is set then - * KVM_STATE_NESTED_GUEST_MODE has to be set as well. - */ - set_default_state(&state); - state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vcpu, &state); - - test_vmx_nested_state(vcpu); - - kvm_vm_free(vm); - return 0; -} -- cgit v1.2.3 From 58e3e5265484a1bf39569903630a45a924621aaa Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Mon, 29 Dec 2025 21:52:27 +0800 Subject: memblock: drop redundant 'struct page *' argument from memblock_free_pages() memblock_free_pages() currently takes both a struct page * and the corresponding PFN. The page pointer is always derived from the PFN at call sites (pfn_to_page(pfn)), making the parameter redundant and also allowing accidental mismatches between the two arguments. Simplify the interface by removing the struct page * argument and deriving the page locally from the PFN, after the deferred struct page initialization check. This keeps the behavior unchanged while making the helper harder to misuse. Signed-off-by: Shengming Hu Reviewed-by: David Hildenbrand (Red Hat) Link: https://patch.msgid.link/tencent_F741CE6ECC49EE099736685E60C0DBD4A209@qq.com Signed-off-by: Mike Rapoport (Microsoft) --- mm/internal.h | 3 +-- mm/memblock.c | 4 ++-- mm/mm_init.c | 5 +++-- tools/testing/memblock/internal.h | 3 +-- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/mm/internal.h b/mm/internal.h index e430da900430..5f93ee1459d9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -742,8 +742,7 @@ static inline void clear_zone_contiguous(struct zone *zone) extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); -extern void memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order); +extern void memblock_free_pages(unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); diff --git a/mm/memblock.c b/mm/memblock.c index 905d06b16348..6e11f81c4870 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1771,7 +1771,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - memblock_free_pages(pfn_to_page(cursor), cursor, 0); + memblock_free_pages(cursor, 0); totalram_pages_inc(); } } @@ -2216,7 +2216,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - memblock_free_pages(pfn_to_page(start), start, order); + memblock_free_pages(start, order); start += (1UL << order); } diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..d5b91602ff2a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2480,9 +2480,10 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -void __init memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order) +void __init memblock_free_pages(unsigned long pfn, unsigned int order) { + struct page *page = pfn_to_page(pfn); + if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 0ab4b53bb4f3..009b97bbdd22 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -15,8 +15,7 @@ bool mirrored_kernelcore = false; struct page {}; -void memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order) +void memblock_free_pages(unsigned long pfn, unsigned int order) { } -- cgit v1.2.3 From 736a2dcfdae72483a36793bc92182f33bd61d30e Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 30 Dec 2025 12:07:31 +0100 Subject: x86/CPU/AMD: Simplify the spectral chicken fix msr_set_bit() takes a bit number to set but MSR_ZEN2_SPECTRAL_CHICKEN_BIT is a bit mask. The usual pattern that code uses is a _BIT-named type macro instead of a mask. So convert it to a bit number to reflect that. Also, msr_set_bit() already does the reading and checking whether the bit needs to be set so use that instead of a local variable. Fixup tabbing while at it. No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://patch.msgid.link/20251230110731.28108-1-bp@kernel.org --- arch/x86/include/asm/msr-index.h | 4 ++-- arch/x86/kernel/cpu/amd.c | 10 ++-------- tools/arch/x86/include/asm/msr-index.h | 4 ++-- 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..43adc38d31d5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -794,8 +794,8 @@ #define MSR_F19H_UMC_PERF_CTR 0xc0010801 /* Zen 2 */ -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT 1 /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bc94ff1e250a..ab9158c94f8c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -900,20 +900,14 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) void init_spectral_chicken(struct cpuinfo_x86 *c) { #ifdef CONFIG_MITIGATION_UNRET_ENTRY - u64 value; - /* * On Zen2 we offer this chicken (bit) on the altar of Speculation. * * This suppresses speculation from the middle of a basic block, i.e. it * suppresses non-branch predictions. */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { - value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); - } - } + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN2_SPECTRAL_CHICKEN, MSR_ZEN2_SPECTRAL_CHICKEN_BIT); #endif } diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 9e1720d73244..d4137a302793 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -770,8 +770,8 @@ #define MSR_F19H_UMC_PERF_CTR 0xc0010801 /* Zen 2 */ -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT 1 /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -- cgit v1.2.3 From 15e8d739fda1084d81f7d3813e9600eba6e0f134 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Thu, 1 Jan 2026 14:40:58 +0100 Subject: selftests/landlock: Properly close a file descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a missing close(srv_fd) call, and use EXPECT_EQ() to check the result. Signed-off-by: Günther Noack Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets") Link: https://lore.kernel.org/r/20260101134102.25938-2-gnoack3000@gmail.com [mic: Use EXPECT_EQ() and update commit message] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 37a5a3df712e..968a91c927a4 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4399,7 +4399,8 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) /* FIONREAD and other IOCTLs should not be forbidden. */ EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); - ASSERT_EQ(0, close(cli_fd)); + EXPECT_EQ(0, close(cli_fd)); + EXPECT_EQ(0, close(srv_fd)); } /* clang-format off */ -- cgit v1.2.3 From 37488ae6ceff9c912dab1a7b2217c563b43f99d2 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:27 +0000 Subject: tools: ynl: pylint suppressions and docstrings Add some docstrings and suppress all the pylint warnings that won't get fixed yet: - no-name-in-module,wrong-import-position - too-many-locals - too-many-branches - too-many-statements - too-many-nested-blocks - too-many-instance-attributes - too-many-arguments - too-many-positional-arguments - too-few-public-methods - missing-class-docstring - missing-function-docstring Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-2-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 17 +++++++++++++++++ tools/net/ynl/pyynl/ethtool.py | 1 + tools/net/ynl/pyynl/lib/__init__.py | 2 ++ tools/net/ynl/pyynl/lib/nlspec.py | 7 +++++++ tools/net/ynl/pyynl/lib/ynl.py | 18 ++++++++++++++++++ 5 files changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index af02a5b7e5a2..996c76be1403 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -1,6 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +""" +YNL cli tool +""" + import argparse import json import os @@ -9,6 +13,7 @@ import pprint import sys import textwrap +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlFamily, Netlink, NlError, SpecFamily @@ -16,6 +21,10 @@ sys_schema_dir='/usr/share/ynl' relative_schema_dir='../../../../Documentation/netlink' def schema_dir(): + """ + Return the effective schema directory, preferring in-tree before + system schema directory. + """ script_dir = os.path.dirname(os.path.abspath(__file__)) schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}") if not os.path.isdir(schema_dir): @@ -25,6 +34,10 @@ def schema_dir(): return schema_dir def spec_dir(): + """ + Return the effective spec directory, relative to the effective + schema directory. + """ spec_dir = schema_dir() + '/specs' if not os.path.isdir(spec_dir): raise Exception(f"Spec directory {spec_dir} does not exist") @@ -32,6 +45,7 @@ def spec_dir(): class YnlEncoder(json.JSONEncoder): + """A custom encoder for emitting JSON with ynl-specific instance types""" def default(self, obj): if isinstance(obj, bytes): return bytes.hex(obj) @@ -94,7 +108,10 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): print_attr_list(ynl, mode_spec['attributes'], attr_set) +# pylint: disable=too-many-locals,too-many-branches,too-many-statements def main(): + """YNL cli tool""" + description = """ YNL CLI utility - a general purpose netlink utility that uses YAML specs to drive protocol encoding and decoding. diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py index fd0f6b8d54d1..40a8ba8d296f 100755 --- a/tools/net/ynl/pyynl/ethtool.py +++ b/tools/net/ynl/pyynl/ethtool.py @@ -8,6 +8,7 @@ import sys import re import os +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlFamily from cli import schema_dir, spec_dir diff --git a/tools/net/ynl/pyynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py index ec9ea00071be..c40dd788fe8a 100644 --- a/tools/net/ynl/pyynl/lib/__init__.py +++ b/tools/net/ynl/pyynl/lib/__init__.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +""" YNL library """ + from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \ SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat from .ynl import YnlFamily, Netlink, NlError diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index 85c17fe01e35..2ffeccf0b99b 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -1,4 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=missing-function-docstring, too-many-instance-attributes, too-many-branches + +""" +The nlspec is a python library for parsing and using YNL netlink +specifications. +""" import collections import importlib diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 36d36eb7e3b8..27169ff8dafc 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -1,4 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=missing-class-docstring, missing-function-docstring +# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes +# pylint: disable=too-many-lines + +""" +YAML Netlink Library + +An implementation of the genetlink and raw netlink protocols. +""" from collections import namedtuple from enum import Enum @@ -22,6 +32,7 @@ from .nlspec import SpecFamily # +# pylint: disable=too-few-public-methods class Netlink: # Netlink socket SOL_NETLINK = 270 @@ -289,6 +300,7 @@ class NlMsg: return msg +# pylint: disable=too-few-public-methods class NlMsgs: def __init__(self, data): self.msgs = [] @@ -319,6 +331,7 @@ def _genl_msg_finalize(msg): return struct.pack("I", len(msg) + 4) + msg +# pylint: disable=too-many-nested-blocks def _genl_load_families(): with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock: sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1) @@ -447,6 +460,7 @@ class GenlProtocol(NetlinkProtocol): return super().msghdr_size() + 4 +# pylint: disable=too-few-public-methods class SpaceAttrs: SpecValuesPair = namedtuple('SpecValuesPair', ['spec', 'values']) @@ -555,6 +569,7 @@ class YnlFamily(SpecFamily): return self._from_string(value, attr_spec) raise e + # pylint: disable=too-many-statements def _add_attr(self, space, name, value, search_attrs): try: attr = self.attr_sets[space][name] @@ -778,6 +793,7 @@ class YnlFamily(SpecFamily): raise Exception(f"Unknown attribute-set '{msg_format.attr_set}' when decoding '{attr_spec.name}'") return decoded + # pylint: disable=too-many-statements def _decode(self, attrs, space, outer_attrs = None): rsp = dict() if space: @@ -838,6 +854,7 @@ class YnlFamily(SpecFamily): return rsp + # pylint: disable=too-many-arguments, too-many-positional-arguments def _decode_extack_path(self, attrs, attr_set, offset, target, search_attrs): for attr in attrs: try: @@ -1081,6 +1098,7 @@ class YnlFamily(SpecFamily): msg = _genl_msg_finalize(msg) return msg + # pylint: disable=too-many-statements def _ops(self, ops): reqs_by_seq = {} req_seq = random.randint(1024, 65535) -- cgit v1.2.3 From bcdd8ea73f750a4a6c38859a3f06027aa40b84c5 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:28 +0000 Subject: tools: ynl: fix pylint redefinition, encoding errors Fix pylint warnings for: - invalid-name - arguments-renamed - redefined-outer-name - unspecified-encoding - consider-using-sys-exit Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-3-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 44 +++++++++++++++---------------- tools/net/ynl/pyynl/lib/nlspec.py | 18 ++++++------- tools/net/ynl/pyynl/lib/ynl.py | 54 +++++++++++++++++++-------------------- 3 files changed, 58 insertions(+), 58 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 996c76be1403..37efa8c4f0e2 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -17,8 +17,8 @@ import textwrap sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlFamily, Netlink, NlError, SpecFamily -sys_schema_dir='/usr/share/ynl' -relative_schema_dir='../../../../Documentation/netlink' +SYS_SCHEMA_DIR='/usr/share/ynl' +RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink' def schema_dir(): """ @@ -26,32 +26,32 @@ def schema_dir(): system schema directory. """ script_dir = os.path.dirname(os.path.abspath(__file__)) - schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}") - if not os.path.isdir(schema_dir): - schema_dir = sys_schema_dir - if not os.path.isdir(schema_dir): - raise Exception(f"Schema directory {schema_dir} does not exist") - return schema_dir + schema_dir_ = os.path.abspath(f"{script_dir}/{RELATIVE_SCHEMA_DIR}") + if not os.path.isdir(schema_dir_): + schema_dir_ = SYS_SCHEMA_DIR + if not os.path.isdir(schema_dir_): + raise Exception(f"Schema directory {schema_dir_} does not exist") + return schema_dir_ def spec_dir(): """ Return the effective spec directory, relative to the effective schema directory. """ - spec_dir = schema_dir() + '/specs' - if not os.path.isdir(spec_dir): - raise Exception(f"Spec directory {spec_dir} does not exist") - return spec_dir + spec_dir_ = schema_dir() + '/specs' + if not os.path.isdir(spec_dir_): + raise Exception(f"Spec directory {spec_dir_} does not exist") + return spec_dir_ class YnlEncoder(json.JSONEncoder): """A custom encoder for emitting JSON with ynl-specific instance types""" - def default(self, obj): - if isinstance(obj, bytes): - return bytes.hex(obj) - if isinstance(obj, set): - return list(obj) - return json.JSONEncoder.default(self, obj) + def default(self, o): + if isinstance(o, bytes): + return bytes.hex(o) + if isinstance(o, set): + return list(o) + return json.JSONEncoder.default(self, o) def print_attr_list(ynl, attr_names, attr_set, indent=2): @@ -196,11 +196,11 @@ def main(): SpecFamily(spec, args.schema) except Exception as error: print(error) - exit(1) + sys.exit(1) return if args.family: # set behaviour when using installed specs - if args.schema is None and spec.startswith(sys_schema_dir): + if args.schema is None and spec.startswith(SYS_SCHEMA_DIR): args.schema = '' # disable schema validation when installed if args.process_unknown is None: args.process_unknown = True @@ -224,7 +224,7 @@ def main(): op = ynl.msgs.get(args.list_attrs) if not op: print(f'Operation {args.list_attrs} not found') - exit(1) + sys.exit(1) print(f'Operation: {op.name}') print(op.yaml['doc']) @@ -259,7 +259,7 @@ def main(): output(msg) except NlError as e: print(e) - exit(1) + sys.exit(1) except KeyboardInterrupt: pass except BrokenPipeError: diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index 2ffeccf0b99b..c3113952c417 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -10,7 +10,7 @@ specifications. import collections import importlib import os -import yaml +import yaml as pyyaml # To be loaded dynamically as needed @@ -313,11 +313,11 @@ class SpecSubMessage(SpecElement): self.formats = collections.OrderedDict() for elem in self.yaml['formats']: - format = self.new_format(family, elem) - self.formats[format.value] = format + msg_format = self.new_format(family, elem) + self.formats[msg_format.value] = msg_format - def new_format(self, family, format): - return SpecSubMessageFormat(family, format) + def new_format(self, family, msg_format): + return SpecSubMessageFormat(family, msg_format) class SpecSubMessageFormat(SpecElement): @@ -436,7 +436,7 @@ class SpecFamily(SpecElement): kernel_family dict of kernel family attributes """ def __init__(self, spec_path, schema_path=None, exclude_ops=None): - with open(spec_path, "r") as stream: + with open(spec_path, "r", encoding='utf-8') as stream: prefix = '# SPDX-License-Identifier: ' first = stream.readline().strip() if not first.startswith(prefix): @@ -444,7 +444,7 @@ class SpecFamily(SpecElement): self.license = first[len(prefix):] stream.seek(0) - spec = yaml.safe_load(stream) + spec = pyyaml.safe_load(stream) self._resolution_list = [] @@ -460,8 +460,8 @@ class SpecFamily(SpecElement): if schema_path: global jsonschema - with open(schema_path, "r") as stream: - schema = yaml.safe_load(stream) + with open(schema_path, "r", encoding='utf-8') as stream: + schema = pyyaml.safe_load(stream) if jsonschema is None: jsonschema = importlib.import_module("jsonschema") diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 27169ff8dafc..78579e495351 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -155,22 +155,22 @@ class NlAttr: @classmethod def get_format(cls, attr_type, byte_order=None): - format = cls.type_formats[attr_type] + format_ = cls.type_formats[attr_type] if byte_order: - return format.big if byte_order == "big-endian" \ - else format.little - return format.native + return format_.big if byte_order == "big-endian" \ + else format_.little + return format_.native def as_scalar(self, attr_type, byte_order=None): - format = self.get_format(attr_type, byte_order) - return format.unpack(self.raw)[0] + format_ = self.get_format(attr_type, byte_order) + return format_.unpack(self.raw)[0] def as_auto_scalar(self, attr_type, byte_order=None): if len(self.raw) != 4 and len(self.raw) != 8: raise Exception(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}") real_type = attr_type[0] + str(len(self.raw) * 8) - format = self.get_format(real_type, byte_order) - return format.unpack(self.raw)[0] + format_ = self.get_format(real_type, byte_order) + return format_.unpack(self.raw)[0] def as_strz(self): return self.raw.decode('ascii')[:-1] @@ -178,9 +178,9 @@ class NlAttr: def as_bin(self): return self.raw - def as_c_array(self, type): - format = self.get_format(type) - return [ x[0] for x in format.iter_unpack(self.raw) ] + def as_c_array(self, c_type): + format_ = self.get_format(c_type) + return [ x[0] for x in format_.iter_unpack(self.raw) ] def __repr__(self): return f"[type:{self.type} len:{self._len}] {self.raw}" @@ -256,8 +256,8 @@ class NlMsg: policy = {} for attr in NlAttrs(raw): if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: - type = attr.as_scalar('u32') - policy['type'] = Netlink.AttrType(type).name + type_ = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type_).name elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: policy['min-value'] = attr.as_scalar('s64') elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: @@ -612,8 +612,8 @@ class YnlFamily(SpecFamily): elif isinstance(value, dict) and attr.struct_name: attr_payload = self._encode_struct(attr.struct_name, value) elif isinstance(value, list) and attr.sub_type in NlAttr.type_formats: - format = NlAttr.get_format(attr.sub_type) - attr_payload = b''.join([format.pack(x) for x in value]) + format_ = NlAttr.get_format(attr.sub_type) + attr_payload = b''.join([format_.pack(x) for x in value]) else: raise Exception(f'Unknown type for binary attribute, value: {value}') elif attr['type'] in NlAttr.type_formats or attr.is_auto_scalar: @@ -622,8 +622,8 @@ class YnlFamily(SpecFamily): attr_type = attr["type"][0] + ('32' if scalar.bit_length() <= 32 else '64') else: attr_type = attr["type"] - format = NlAttr.get_format(attr_type, attr.byte_order) - attr_payload = format.pack(scalar) + format_ = NlAttr.get_format(attr_type, attr.byte_order) + attr_payload = format_.pack(scalar) elif attr['type'] in "bitfield32": scalar_value = self._get_scalar(attr, value["value"]) scalar_selector = self._get_scalar(attr, value["selector"]) @@ -915,8 +915,8 @@ class YnlFamily(SpecFamily): else: size += m.len else: - format = NlAttr.get_format(m.type, m.byte_order) - size += format.size + format_ = NlAttr.get_format(m.type, m.byte_order) + size += format_.size return size else: return 0 @@ -931,17 +931,17 @@ class YnlFamily(SpecFamily): offset += m.len elif m.type == 'binary': if m.struct: - len = self._struct_size(m.struct) - value = self._decode_struct(data[offset : offset + len], + len_ = self._struct_size(m.struct) + value = self._decode_struct(data[offset : offset + len_], m.struct) - offset += len + offset += len_ else: value = data[offset : offset + m.len] offset += m.len else: - format = NlAttr.get_format(m.type, m.byte_order) - [ value ] = format.unpack_from(data, offset) - offset += format.size + format_ = NlAttr.get_format(m.type, m.byte_order) + [ value ] = format_.unpack_from(data, offset) + offset += format_.size if value is not None: if m.enum: value = self._decode_enum(value, m) @@ -970,8 +970,8 @@ class YnlFamily(SpecFamily): else: if value is None: value = 0 - format = NlAttr.get_format(m.type, m.byte_order) - attr_payload += format.pack(value) + format_ = NlAttr.get_format(m.type, m.byte_order) + attr_payload += format_.pack(value) return attr_payload def _formatted_string(self, raw, display_hint): -- cgit v1.2.3 From b6270a10b0f8727fea3005eecd9907ddaf38dc3f Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:29 +0000 Subject: tools: ynl: fix pylint exception warnings Fix pylint warnings for: - broad-exception-raised - broad-exception-caught - raise-missing-from Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-4-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 10 +++---- tools/net/ynl/pyynl/lib/__init__.py | 8 +++-- tools/net/ynl/pyynl/lib/nlspec.py | 11 +++++-- tools/net/ynl/pyynl/lib/ynl.py | 59 +++++++++++++++++++++---------------- 4 files changed, 52 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 37efa8c4f0e2..5fee45e48bbf 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -15,7 +15,7 @@ import textwrap # pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) -from lib import YnlFamily, Netlink, NlError, SpecFamily +from lib import YnlFamily, Netlink, NlError, SpecFamily, SpecException, YnlException SYS_SCHEMA_DIR='/usr/share/ynl' RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink' @@ -30,7 +30,7 @@ def schema_dir(): if not os.path.isdir(schema_dir_): schema_dir_ = SYS_SCHEMA_DIR if not os.path.isdir(schema_dir_): - raise Exception(f"Schema directory {schema_dir_} does not exist") + raise YnlException(f"Schema directory {schema_dir_} does not exist") return schema_dir_ def spec_dir(): @@ -40,7 +40,7 @@ def spec_dir(): """ spec_dir_ = schema_dir() + '/specs' if not os.path.isdir(spec_dir_): - raise Exception(f"Spec directory {spec_dir_} does not exist") + raise YnlException(f"Spec directory {spec_dir_} does not exist") return spec_dir_ @@ -189,12 +189,12 @@ def main(): else: spec = args.spec if not os.path.isfile(spec): - raise Exception(f"Spec file {spec} does not exist") + raise YnlException(f"Spec file {spec} does not exist") if args.validate: try: SpecFamily(spec, args.schema) - except Exception as error: + except SpecException as error: print(error) sys.exit(1) return diff --git a/tools/net/ynl/pyynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py index c40dd788fe8a..33a96155fb3b 100644 --- a/tools/net/ynl/pyynl/lib/__init__.py +++ b/tools/net/ynl/pyynl/lib/__init__.py @@ -3,11 +3,13 @@ """ YNL library """ from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \ - SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat -from .ynl import YnlFamily, Netlink, NlError + SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat, \ + SpecException +from .ynl import YnlFamily, Netlink, NlError, YnlException from .doc_generator import YnlDocGenerator __all__ = ["SpecAttr", "SpecAttrSet", "SpecEnumEntry", "SpecEnumSet", "SpecFamily", "SpecOperation", "SpecSubMessage", "SpecSubMessageFormat", - "YnlFamily", "Netlink", "NlError", "YnlDocGenerator"] + "SpecException", + "YnlFamily", "Netlink", "NlError", "YnlDocGenerator", "YnlException"] diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index c3113952c417..a35f827f09e3 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -17,6 +17,11 @@ import yaml as pyyaml jsonschema = None +class SpecException(Exception): + """Netlink spec exception. + """ + + class SpecElement: """Netlink spec element. @@ -385,7 +390,7 @@ class SpecOperation(SpecElement): elif self.is_resv: attr_set_name = '' else: - raise Exception(f"Can't resolve attribute set for op '{self.name}'") + raise SpecException(f"Can't resolve attribute set for op '{self.name}'") if attr_set_name: self.attr_set = self.family.attr_sets[attr_set_name] @@ -440,7 +445,7 @@ class SpecFamily(SpecElement): prefix = '# SPDX-License-Identifier: ' first = stream.readline().strip() if not first.startswith(prefix): - raise Exception('SPDX license tag required in the spec') + raise SpecException('SPDX license tag required in the spec') self.license = first[len(prefix):] stream.seek(0) @@ -555,7 +560,7 @@ class SpecFamily(SpecElement): req_val_next = req_val + 1 rsp_val_next = rsp_val + rsp_inc else: - raise Exception("Can't parse directional ops") + raise SpecException("Can't parse directional ops") if req_val == req_val_next: req_val = None diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 78579e495351..6e39618e5598 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -32,6 +32,10 @@ from .nlspec import SpecFamily # +class YnlException(Exception): + pass + + # pylint: disable=too-few-public-methods class Netlink: # Netlink socket @@ -167,7 +171,7 @@ class NlAttr: def as_auto_scalar(self, attr_type, byte_order=None): if len(self.raw) != 4 and len(self.raw) != 8: - raise Exception(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}") + raise YnlException(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}") real_type = attr_type[0] + str(len(self.raw) * 8) format_ = self.get_format(real_type, byte_order) return format_.unpack(self.raw)[0] @@ -425,7 +429,7 @@ class NetlinkProtocol: def get_mcast_id(self, mcast_name, mcast_groups): if mcast_name not in mcast_groups: - raise Exception(f'Multicast group "{mcast_name}" not present in the spec') + raise YnlException(f'Multicast group "{mcast_name}" not present in the spec') return mcast_groups[mcast_name].value def msghdr_size(self): @@ -453,7 +457,7 @@ class GenlProtocol(NetlinkProtocol): def get_mcast_id(self, mcast_name, mcast_groups): if mcast_name not in self.genl_family['mcast']: - raise Exception(f'Multicast group "{mcast_name}" not present in the family') + raise YnlException(f'Multicast group "{mcast_name}" not present in the family') return self.genl_family['mcast'][mcast_name] def msghdr_size(self): @@ -475,9 +479,9 @@ class SpaceAttrs: if name in scope.values: return scope.values[name] spec_name = scope.spec.yaml['name'] - raise Exception( + raise YnlException( f"No value for '{name}' in attribute space '{spec_name}'") - raise Exception(f"Attribute '{name}' not defined in any attribute-set") + raise YnlException(f"Attribute '{name}' not defined in any attribute-set") # @@ -499,8 +503,8 @@ class YnlFamily(SpecFamily): self.yaml['protonum']) else: self.nlproto = GenlProtocol(self.yaml['name']) - except KeyError: - raise Exception(f"Family '{self.yaml['name']}' not supported by the kernel") + except KeyError as err: + raise YnlException(f"Family '{self.yaml['name']}' not supported by the kernel") from err self._recv_dbg = False # Note that netlink will use conservative (min) message size for @@ -573,8 +577,8 @@ class YnlFamily(SpecFamily): def _add_attr(self, space, name, value, search_attrs): try: attr = self.attr_sets[space][name] - except KeyError: - raise Exception(f"Space '{space}' has no attribute '{name}'") + except KeyError as err: + raise YnlException(f"Space '{space}' has no attribute '{name}'") from err nl_type = attr.value if attr.is_multi and isinstance(value, list): @@ -615,7 +619,7 @@ class YnlFamily(SpecFamily): format_ = NlAttr.get_format(attr.sub_type) attr_payload = b''.join([format_.pack(x) for x in value]) else: - raise Exception(f'Unknown type for binary attribute, value: {value}') + raise YnlException(f'Unknown type for binary attribute, value: {value}') elif attr['type'] in NlAttr.type_formats or attr.is_auto_scalar: scalar = self._get_scalar(attr, value) if attr.is_auto_scalar: @@ -641,9 +645,9 @@ class YnlFamily(SpecFamily): attr_payload += self._add_attr(msg_format.attr_set, subname, subvalue, sub_attrs) else: - raise Exception(f"Unknown attribute-set '{msg_format.attr_set}'") + raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}'") else: - raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}') + raise YnlException(f'Unknown type at {space} {name} {value} {attr["type"]}') return self._add_attr_raw(nl_type, attr_payload) @@ -730,7 +734,7 @@ class YnlFamily(SpecFamily): subattr = self._formatted_string(subattr, attr_spec.display_hint) decoded.append(subattr) else: - raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') + raise YnlException(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -767,13 +771,13 @@ class YnlFamily(SpecFamily): def _resolve_selector(self, attr_spec, search_attrs): sub_msg = attr_spec.sub_message if sub_msg not in self.sub_msgs: - raise Exception(f"No sub-message spec named {sub_msg} for {attr_spec.name}") + raise YnlException(f"No sub-message spec named {sub_msg} for {attr_spec.name}") sub_msg_spec = self.sub_msgs[sub_msg] selector = attr_spec.selector value = search_attrs.lookup(selector) if value not in sub_msg_spec.formats: - raise Exception(f"No message format for '{value}' in sub-message spec '{sub_msg}'") + raise YnlException(f"No message format for '{value}' in sub-message spec '{sub_msg}'") spec = sub_msg_spec.formats[value] return spec, value @@ -790,7 +794,8 @@ class YnlFamily(SpecFamily): subdict = self._decode(NlAttrs(attr.raw, offset), msg_format.attr_set) decoded.update(subdict) else: - raise Exception(f"Unknown attribute-set '{msg_format.attr_set}' when decoding '{attr_spec.name}'") + raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}' " + f"when decoding '{attr_spec.name}'") return decoded # pylint: disable=too-many-statements @@ -803,9 +808,10 @@ class YnlFamily(SpecFamily): for attr in attrs: try: attr_spec = attr_space.attrs_by_val[attr.type] - except (KeyError, UnboundLocalError): + except (KeyError, UnboundLocalError) as err: if not self.process_unknown: - raise Exception(f"Space '{space}' has no attribute with value '{attr.type}'") + raise YnlException(f"Space '{space}' has no attribute " + f"with value '{attr.type}'") from err attr_name = f"UnknownAttr({attr.type})" self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr)) continue @@ -844,7 +850,8 @@ class YnlFamily(SpecFamily): decoded = self._decode_nest_type_value(attr, attr_spec) else: if not self.process_unknown: - raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}') + raise YnlException(f'Unknown {attr_spec["type"]} ' + f'with name {attr_spec["name"]}') decoded = self._decode_unknown(attr) self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded) @@ -859,8 +866,9 @@ class YnlFamily(SpecFamily): for attr in attrs: try: attr_spec = attr_set.attrs_by_val[attr.type] - except KeyError: - raise Exception(f"Space '{attr_set.name}' has no attribute with value '{attr.type}'") + except KeyError as err: + raise YnlException( + f"Space '{attr_set.name}' has no attribute with value '{attr.type}'") from err if offset > target: break if offset == target: @@ -877,11 +885,12 @@ class YnlFamily(SpecFamily): elif attr_spec['type'] == 'sub-message': msg_format, value = self._resolve_selector(attr_spec, search_attrs) if msg_format is None: - raise Exception(f"Can't resolve sub-message of {attr_spec['name']} for extack") + raise YnlException(f"Can't resolve sub-message of " + f"{attr_spec['name']} for extack") sub_attrs = self.attr_sets[msg_format.attr_set] pathname += f"({value})" else: - raise Exception(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack") + raise YnlException(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack") offset += 4 subpath = self._decode_extack_path(NlAttrs(attr.raw), sub_attrs, offset, target, search_attrs) @@ -1008,11 +1017,11 @@ class YnlFamily(SpecFamily): mac_bytes = [int(x, 16) for x in string.split(':')] else: if len(string) % 2 != 0: - raise Exception(f"Invalid MAC address format: {string}") + raise YnlException(f"Invalid MAC address format: {string}") mac_bytes = [int(string[i:i+2], 16) for i in range(0, len(string), 2)] raw = bytes(mac_bytes) else: - raise Exception(f"Display hint '{attr_spec.display_hint}' not implemented" + raise YnlException(f"Display hint '{attr_spec.display_hint}' not implemented" f" when parsing '{attr_spec['name']}'") return raw -- cgit v1.2.3 From 04b0b64e86b7ee9923099d141f8e2ed74389c435 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:30 +0000 Subject: tools: ynl: fix pylint dict, indentation, long lines, uninitialised Fix pylint warnings for: - use-dict-literal - bad-indentation - line-too-long - possibly-used-before-assignment Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-5-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/nlspec.py | 19 ++++++++++--------- tools/net/ynl/pyynl/lib/ynl.py | 37 +++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index a35f827f09e3..fcd4106d0cfa 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -129,8 +129,8 @@ class SpecEnumSet(SpecElement): prev_entry = None value_start = self.yaml.get('value-start', 0) - self.entries = dict() - self.entries_by_val = dict() + self.entries = {} + self.entries_by_val = {} for entry in self.yaml['entries']: e = self.new_entry(entry, prev_entry, value_start) self.entries[e.name] = e @@ -451,6 +451,7 @@ class SpecFamily(SpecElement): stream.seek(0) spec = pyyaml.safe_load(stream) + self.fixed_header = None self._resolution_list = [] super().__init__(self, spec) @@ -579,13 +580,13 @@ class SpecFamily(SpecElement): self.msgs[op.name] = op def find_operation(self, name): - """ - For a given operation name, find and return operation spec. - """ - for op in self.yaml['operations']['list']: - if name == op['name']: - return op - return None + """ + For a given operation name, find and return operation spec. + """ + for op in self.yaml['operations']['list']: + if name == op['name']: + return op + return None def resolve(self): self.resolve_up(super()) diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 6e39618e5598..040ff3b87c17 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -235,7 +235,7 @@ class NlMsg: self.extack = None if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off: - self.extack = dict() + self.extack = {} extack_attrs = NlAttrs(self.raw[extack_off:]) for extack in extack_attrs: if extack.type == Netlink.NLMSGERR_ATTR_MSG: @@ -296,7 +296,8 @@ class NlMsg: return self.nl_type def __repr__(self): - msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}" + msg = (f"nl_len = {self.nl_len} ({len(self.raw)}) " + f"nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}") if self.error: msg += '\n\terror: ' + str(self.error) if self.extack: @@ -361,7 +362,7 @@ def _genl_load_families(): return gm = GenlMsg(nl_msg) - fam = dict() + fam = {} for attr in NlAttrs(gm.raw): if attr.type == Netlink.CTRL_ATTR_FAMILY_ID: fam['id'] = attr.as_scalar('u16') @@ -370,7 +371,7 @@ def _genl_load_families(): elif attr.type == Netlink.CTRL_ATTR_MAXATTR: fam['maxattr'] = attr.as_scalar('u32') elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS: - fam['mcast'] = dict() + fam['mcast'] = {} for entry in NlAttrs(attr.raw): mcast_name = None mcast_id = None @@ -390,6 +391,7 @@ class GenlMsg: self.nl = nl_msg self.genl_cmd, self.genl_version, _ = struct.unpack_from("BBH", nl_msg.raw, 0) self.raw = nl_msg.raw[4:] + self.raw_attrs = [] def cmd(self): return self.genl_cmd @@ -560,8 +562,7 @@ class YnlFamily(SpecFamily): for single_value in value: scalar += enum.entries[single_value].user_value(as_flags = True) return scalar - else: - return enum.entries[value].user_value() + return enum.entries[value].user_value() def _get_scalar(self, attr_spec, value): try: @@ -750,8 +751,7 @@ class YnlFamily(SpecFamily): def _decode_unknown(self, attr): if attr.is_nest: return self._decode(NlAttrs(attr.raw), None) - else: - return attr.as_bin() + return attr.as_bin() def _rsp_add(self, rsp, name, is_multi, decoded): if is_multi is None: @@ -800,7 +800,8 @@ class YnlFamily(SpecFamily): # pylint: disable=too-many-statements def _decode(self, attrs, space, outer_attrs = None): - rsp = dict() + rsp = {} + search_attrs = {} if space: attr_space = self.attr_sets[space] search_attrs = SpaceAttrs(attr_space, rsp, outer_attrs) @@ -818,7 +819,9 @@ class YnlFamily(SpecFamily): try: if attr_spec["type"] == 'nest': - subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs) + subdict = self._decode(NlAttrs(attr.raw), + attr_spec['nested-attributes'], + search_attrs) decoded = subdict elif attr_spec["type"] == 'string': decoded = attr.as_strz() @@ -927,12 +930,11 @@ class YnlFamily(SpecFamily): format_ = NlAttr.get_format(m.type, m.byte_order) size += format_.size return size - else: - return 0 + return 0 def _decode_struct(self, data, name): members = self.consts[name].members - attrs = dict() + attrs = {} offset = 0 for m in members: value = None @@ -969,7 +971,7 @@ class YnlFamily(SpecFamily): elif m.type == 'binary': if m.struct: if value is None: - value = dict() + value = {} attr_payload += self._encode_struct(m.struct, value) else: if value is None: @@ -1026,7 +1028,7 @@ class YnlFamily(SpecFamily): return raw def handle_ntf(self, decoded): - msg = dict() + msg = {} if self.include_raw: msg['raw'] = decoded op = self.rsp_by_value[decoded.cmd()] @@ -1166,9 +1168,8 @@ class YnlFamily(SpecFamily): if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue - else: - print('Unexpected message: ' + repr(decoded)) - continue + print('Unexpected message: ' + repr(decoded)) + continue rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: -- cgit v1.2.3 From 542ba2de32fb7ad6b8c51a05a4f0d6ca3cc66d67 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:31 +0000 Subject: tools: ynl: fix pylint misc warnings Fix pylint warnings for: - unused-argument - consider-using-in - consider-using-get - consider-using-f-string - protected-access - unidiomatic-typecheck - no-else-return Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-6-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/doc_generator.py | 3 +-- tools/net/ynl/pyynl/lib/nlspec.py | 5 ++--- tools/net/ynl/pyynl/lib/ynl.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/lib/doc_generator.py b/tools/net/ynl/pyynl/lib/doc_generator.py index 3a16b8eb01ca..d1afff9d9956 100644 --- a/tools/net/ynl/pyynl/lib/doc_generator.py +++ b/tools/net/ynl/pyynl/lib/doc_generator.py @@ -109,8 +109,7 @@ class RstFormatters: 'fixed-header': 'definition', 'nested-attributes': 'attribute-set', 'struct': 'definition'} - if prefix in mappings: - prefix = mappings[prefix] + prefix = mappings.get(prefix, prefix) return f":ref:`{namespace}-{prefix}-{name}`" def rst_header(self) -> str: diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index fcd4106d0cfa..f3173146b64b 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -105,8 +105,7 @@ class SpecEnumEntry(SpecElement): def user_value(self, as_flags=None): if self.enum_set['type'] == 'flags' or as_flags: return 1 << self.value - else: - return self.value + return self.value class SpecEnumSet(SpecElement): @@ -194,7 +193,7 @@ class SpecAttr(SpecElement): self.sub_message = yaml.get('sub-message') self.selector = yaml.get('selector') - self.is_auto_scalar = self.type == "sint" or self.type == "uint" + self.is_auto_scalar = self.type in ("sint", "uint") class SpecAttrSet(SpecElement): diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 040ff3b87c17..4bc8e58cb621 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -415,7 +415,7 @@ class NetlinkProtocol: nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0) return nlmsg - def message(self, flags, command, version, seq=None): + def message(self, flags, command, _version, seq=None): return self._message(command, flags, seq) def _decode(self, nl_msg): @@ -425,7 +425,7 @@ class NetlinkProtocol: msg = self._decode(nl_msg) if op is None: op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl.struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -755,7 +755,7 @@ class YnlFamily(SpecFamily): def _rsp_add(self, rsp, name, is_multi, decoded): if is_multi is None: - if name in rsp and type(rsp[name]) is not list: + if name in rsp and not isinstance(rsp[name], list): rsp[name] = [rsp[name]] is_multi = True else: @@ -788,7 +788,7 @@ class YnlFamily(SpecFamily): offset = 0 if msg_format.fixed_header: decoded.update(self._decode_struct(attr.raw, msg_format.fixed_header)) - offset = self._struct_size(msg_format.fixed_header) + offset = self.struct_size(msg_format.fixed_header) if msg_format.attr_set: if msg_format.attr_set in self.attr_sets: subdict = self._decode(NlAttrs(attr.raw, offset), msg_format.attr_set) @@ -908,7 +908,7 @@ class YnlFamily(SpecFamily): return msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) - offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) + offset = self.nlproto.msghdr_size() + self.struct_size(op.fixed_header) search_attrs = SpaceAttrs(op.attr_set, vals) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs'], search_attrs) @@ -916,14 +916,14 @@ class YnlFamily(SpecFamily): del extack['bad-attr-offs'] extack['bad-attr'] = path - def _struct_size(self, name): + def struct_size(self, name): if name: members = self.consts[name].members size = 0 for m in members: if m.type in ['pad', 'binary']: if m.struct: - size += self._struct_size(m.struct) + size += self.struct_size(m.struct) else: size += m.len else: @@ -942,7 +942,7 @@ class YnlFamily(SpecFamily): offset += m.len elif m.type == 'binary': if m.struct: - len_ = self._struct_size(m.struct) + len_ = self.struct_size(m.struct) value = self._decode_struct(data[offset : offset + len_], m.struct) offset += len_ @@ -987,7 +987,7 @@ class YnlFamily(SpecFamily): def _formatted_string(self, raw, display_hint): if display_hint == 'mac': - formatted = ':'.join('%02x' % b for b in raw) + formatted = ':'.join(f'{b:02x}' for b in raw) elif display_hint == 'hex': if isinstance(raw, int): formatted = hex(raw) -- cgit v1.2.3 From 00ef9f153ed899e26382fc7918c1d087b20ef2c5 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:32 +0000 Subject: tools: ynl: fix pylint global variable related warnings Refactor to avoid using global variables to fix the following pylint issues: - invalid-name - global-statement - global-variable-not-assigned Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-7-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/nlspec.py | 16 +++++++--------- tools/net/ynl/pyynl/lib/ynl.py | 24 ++++++++++-------------- 2 files changed, 17 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index f3173146b64b..0b5277082b38 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -13,10 +13,6 @@ import os import yaml as pyyaml -# To be loaded dynamically as needed -jsonschema = None - - class SpecException(Exception): """Netlink spec exception. """ @@ -439,6 +435,10 @@ class SpecFamily(SpecElement): mcast_groups dict of all multicast groups (index by name) kernel_family dict of kernel family attributes """ + + # To be loaded dynamically as needed + jsonschema = None + def __init__(self, spec_path, schema_path=None, exclude_ops=None): with open(spec_path, "r", encoding='utf-8') as stream: prefix = '# SPDX-License-Identifier: ' @@ -463,15 +463,13 @@ class SpecFamily(SpecElement): if schema_path is None: schema_path = os.path.dirname(os.path.dirname(spec_path)) + f'/{self.proto}.yaml' if schema_path: - global jsonschema - with open(schema_path, "r", encoding='utf-8') as stream: schema = pyyaml.safe_load(stream) - if jsonschema is None: - jsonschema = importlib.import_module("jsonschema") + if SpecFamily.jsonschema is None: + SpecFamily.jsonschema = importlib.import_module("jsonschema") - jsonschema.validate(self.yaml, schema) + SpecFamily.jsonschema.validate(self.yaml, schema) self.attr_sets = collections.OrderedDict() self.sub_msgs = collections.OrderedDict() diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 4bc8e58cb621..9774005e7ad1 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -320,9 +320,6 @@ class NlMsgs: yield from self.msgs -genl_family_name_to_id = None - - def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None): # we prepend length in _genl_msg_finalize() if seq is None: @@ -338,6 +335,8 @@ def _genl_msg_finalize(msg): # pylint: disable=too-many-nested-blocks def _genl_load_families(): + genl_family_name_to_id = {} + with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock: sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1) @@ -348,18 +347,14 @@ def _genl_load_families(): sock.send(msg, 0) - global genl_family_name_to_id - genl_family_name_to_id = dict() - while True: reply = sock.recv(128 * 1024) nms = NlMsgs(reply) for nl_msg in nms: if nl_msg.error: - print("Netlink error:", nl_msg.error) - return + raise YnlException(f"Netlink error: {nl_msg.error}") if nl_msg.done: - return + return genl_family_name_to_id gm = GenlMsg(nl_msg) fam = {} @@ -439,15 +434,16 @@ class NetlinkProtocol: class GenlProtocol(NetlinkProtocol): + genl_family_name_to_id = {} + def __init__(self, family_name): super().__init__(family_name, Netlink.NETLINK_GENERIC) - global genl_family_name_to_id - if genl_family_name_to_id is None: - _genl_load_families() + if not GenlProtocol.genl_family_name_to_id: + GenlProtocol.genl_family_name_to_id = _genl_load_families() - self.genl_family = genl_family_name_to_id[family_name] - self.family_id = genl_family_name_to_id[family_name]['id'] + self.genl_family = GenlProtocol.genl_family_name_to_id[family_name] + self.family_id = GenlProtocol.genl_family_name_to_id[family_name]['id'] def message(self, flags, command, version, seq=None): nlmsg = self._message(self.family_id, flags, seq) -- cgit v1.2.3 From 9b6b016df4c2902cd6acd2673bffea6c1e8f643d Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:33 +0000 Subject: tools: ynl: fix logic errors reported by pylint Fix the following logic errors: tools/net/ynl/pyynl/lib/nlspec.py:299:15: E1101: Instance of 'list' has no 'items' member (no-member) tools/net/ynl/pyynl/lib/nlspec.py:580:22: E0606: Possibly using variable 'op' before assignment (possibly-used-before-assignment) Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-8-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/nlspec.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index 0b5277082b38..fcffeb5b7ba3 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -295,7 +295,7 @@ class SpecStruct(SpecElement): yield from self.members def items(self): - return self.members.items() + return self.members class SpecSubMessage(SpecElement): @@ -570,12 +570,11 @@ class SpecFamily(SpecElement): skip |= bool(exclude.match(elem['name'])) if not skip: op = self.new_operation(elem, req_val, rsp_val) + self.msgs[op.name] = op req_val = req_val_next rsp_val = rsp_val_next - self.msgs[op.name] = op - def find_operation(self, name): """ For a given operation name, find and return operation spec. -- cgit v1.2.3 From 301da4cfea5fef6dfc82a37031b84c89e06c5c7b Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:34 +0000 Subject: tools: ynl: ethtool: fix pylint issues Fix or suppress all the pylint issues in ethtool.py, except for TODO (fixme) items. Suppress: - too-many-locals - too-many-branches - too-many-statements - too-many-return-statements - import-error Fix: - missing-module-docstring - redefined-outer-name - dangerous-default-value - use-dict-literal - missing-function-docstring - global-variable-undefined - expression-not-assigned - inconsistent-return-statements - wrong-import-order Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-9-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ethtool.py | 46 ++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py index 40a8ba8d296f..f1a2a2a89985 100755 --- a/tools/net/ynl/pyynl/ethtool.py +++ b/tools/net/ynl/pyynl/ethtool.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=too-many-locals, too-many-branches, too-many-statements +# pylint: disable=too-many-return-statements + +""" YNL ethtool utility """ import argparse import pathlib @@ -10,8 +15,10 @@ import os # pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) -from lib import YnlFamily +# pylint: disable=import-error from cli import schema_dir, spec_dir +from lib import YnlFamily + def args_to_req(ynl, op_name, args, req): """ @@ -49,7 +56,8 @@ def print_field(reply, *desc): return if len(desc) == 0: - return print_field(reply, *zip(reply.keys(), reply.keys())) + print_field(reply, *zip(reply.keys(), reply.keys())) + return for spec in desc: try: @@ -89,11 +97,12 @@ def doit(ynl, args, op_name): args_to_req(ynl, op_name, args.args, req) ynl.do(op_name, req) -def dumpit(ynl, args, op_name, extra = {}): +def dumpit(ynl, args, op_name, extra=None): """ Prepare request header, parse arguments and dumpit (filtering out the devices we're not interested in). """ + extra = extra or {} reply = ynl.dump(op_name, { 'header': {} } | extra) if not reply: return {} @@ -115,9 +124,9 @@ def bits_to_dict(attr): """ ret = {} if 'bits' not in attr: - return dict() + return {} if 'bit' not in attr['bits']: - return dict() + return {} for bit in attr['bits']['bit']: if bit['name'] == '': continue @@ -127,6 +136,8 @@ def bits_to_dict(attr): return ret def main(): + """ YNL ethtool utility """ + parser = argparse.ArgumentParser(description='ethtool wannabe') parser.add_argument('--json', action=argparse.BooleanOptionalAction) parser.add_argument('--show-priv-flags', action=argparse.BooleanOptionalAction) @@ -156,7 +167,7 @@ def main(): # TODO: rss-get parser.add_argument('device', metavar='device', type=str) parser.add_argument('args', metavar='args', type=str, nargs='*') - global args + args = parser.parse_args() spec = os.path.join(spec_dir(), 'ethtool.yaml') @@ -170,13 +181,16 @@ def main(): return if args.set_eee: - return doit(ynl, args, 'eee-set') + doit(ynl, args, 'eee-set') + return if args.set_pause: - return doit(ynl, args, 'pause-set') + doit(ynl, args, 'pause-set') + return if args.set_coalesce: - return doit(ynl, args, 'coalesce-set') + doit(ynl, args, 'coalesce-set') + return if args.set_features: # TODO: parse the bitmask @@ -184,10 +198,12 @@ def main(): return if args.set_channels: - return doit(ynl, args, 'channels-set') + doit(ynl, args, 'channels-set') + return if args.set_ring: - return doit(ynl, args, 'rings-set') + doit(ynl, args, 'rings-set') + return if args.show_priv_flags: flags = bits_to_dict(dumpit(ynl, args, 'privflags-get')['flags']) @@ -338,25 +354,25 @@ def main(): print(f'Time stamping parameters for {args.device}:') print('Capabilities:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])] print(f'PTP Hardware Clock: {tsinfo.get("phc-index", "none")}') if 'tx-types' in tsinfo: print('Hardware Transmit Timestamp Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] else: print('Hardware Transmit Timestamp Modes: none') if 'rx-filters' in tsinfo: print('Hardware Receive Filter Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] else: print('Hardware Receive Filter Modes: none') if 'stats' in tsinfo and tsinfo['stats']: print('Statistics:') - [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] + _ = [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return -- cgit v1.2.3 From 9a130471f854aa2146e74a8dec47478d9e6d0654 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:35 +0000 Subject: tools: ynl: fix pylint issues in ynl_gen_rst Add a couple of pylint suppressions to ynl_gen_rst.py: - no-name-in-module,wrong-import-position - broad-exception-caught Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-10-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_rst.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ynl_gen_rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py index 90ae19aac89d..30324e2fd682 100755 --- a/tools/net/ynl/pyynl/ynl_gen_rst.py +++ b/tools/net/ynl/pyynl/ynl_gen_rst.py @@ -19,6 +19,7 @@ import sys import argparse import logging +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlDocGenerator # pylint: disable=C0413 @@ -60,6 +61,7 @@ def write_to_rstfile(content: str, filename: str) -> None: rst_file.write(content) +# pylint: disable=broad-exception-caught def main() -> None: """Main function that reads the YAML files and generates the RST files""" -- cgit v1.2.3 From c2fa97c509ec5d0d9b3134132b90c117a961f2b6 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:36 +0000 Subject: tools: ynl-gen-c: suppress unhelpful pylint messages Disable pylint messages for too-many-*, too-few-*, docstrings, broad-exception-* and messages for specific code that won't get changed. Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-11-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index b517d0c605ad..14d16024fe11 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1,5 +1,11 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# pylint: disable=line-too-long, missing-class-docstring, missing-function-docstring +# pylint: disable=too-many-positional-arguments, too-many-arguments, too-many-statements +# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes +# pylint: disable=too-many-nested-blocks, too-many-lines, too-few-public-methods +# pylint: disable=broad-exception-raised, broad-exception-caught, protected-access import argparse import filecmp @@ -11,6 +17,7 @@ import sys import tempfile import yaml +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation, SpecEnumSet, SpecEnumEntry from lib import SpecSubMessage @@ -183,6 +190,7 @@ class Type(SpecAttr): for line in lines: ri.cw.p(line) + # pylint: disable=assignment-from-none def arg_member(self, ri): member = self._complex_member_type(ri) if member: @@ -280,6 +288,7 @@ class Type(SpecAttr): code = [] presence = '' + # pylint: disable=consider-using-enumerate for i in range(0, len(ref)): presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}" # Every layer below last is a nest, so we know it uses bit presence @@ -414,6 +423,7 @@ class TypeScalar(Type): if low < -32768 or high > 32767: self.checks['full-range'] = True + # pylint: disable=too-many-return-statements def _attr_policy(self, policy): if 'flags-mask' in self.checks or self.is_bitfield: if self.is_bitfield: @@ -1650,6 +1660,7 @@ class CodeWriter: if out_file is None: self._out = os.sys.stdout else: + # pylint: disable=consider-using-with self._out = tempfile.NamedTemporaryFile('w+') self._out_file = out_file -- cgit v1.2.3 From 93ef84292959e14fac8aa49079ae951e0078dc6e Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:37 +0000 Subject: tools: ynl-gen-c: fix pylint warnings for returns, unused, redefined Fix the following pylint warnings: - unused-argument - unused-variable - no-else-return - inconsistent-return-statements - redefined-outer-name - unreachable Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-12-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 100 ++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 14d16024fe11..900896779e61 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -7,6 +7,12 @@ # pylint: disable=too-many-nested-blocks, too-many-lines, too-few-public-methods # pylint: disable=broad-exception-raised, broad-exception-caught, protected-access +""" +ynl_gen_c + +A YNL to C code generator for both kernel and userspace protocol stubs. +""" + import argparse import filecmp import pathlib @@ -15,7 +21,7 @@ import re import shutil import sys import tempfile -import yaml +import yaml as pyyaml # pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) @@ -164,7 +170,7 @@ class Type(SpecAttr): def presence_member(self, space, type_filter): if self.presence_type() != type_filter: - return + return '' if self.presence_type() == 'present': pfx = '__' if space == 'user' else '' @@ -173,14 +179,15 @@ class Type(SpecAttr): if self.presence_type() in {'len', 'count'}: pfx = '__' if space == 'user' else '' return f"{pfx}u32 {self.c_name};" + return '' - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return None def free_needs_iter(self): return False - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): if self.is_multi_val() or self.presence_type() in {'count', 'len'}: return [f'free({var}->{ref}{self.c_name});'] return [] @@ -278,7 +285,7 @@ class Type(SpecAttr): def _setter_lines(self, ri, member, presence): raise Exception(f"Setter not implemented for class type {self.type}") - def setter(self, ri, space, direction, deref=False, ref=None, var="req"): + def setter(self, ri, _space, direction, deref=False, ref=None, var="req"): ref = (ref if ref else []) + [self.c_name] member = f"{var}->{'.'.join(ref)}" @@ -434,15 +441,15 @@ class TypeScalar(Type): flag_cnt = len(flags['entries']) mask = (1 << flag_cnt) - 1 return f"NLA_POLICY_MASK({policy}, 0x{mask:x})" - elif 'full-range' in self.checks: + if 'full-range' in self.checks: return f"NLA_POLICY_FULL_RANGE({policy}, &{c_lower(self.enum_name)}_range)" - elif 'range' in self.checks: + if 'range' in self.checks: return f"NLA_POLICY_RANGE({policy}, {self.get_limit_str('min')}, {self.get_limit_str('max')})" - elif 'min' in self.checks: + if 'min' in self.checks: return f"NLA_POLICY_MIN({policy}, {self.get_limit_str('min')})" - elif 'max' in self.checks: + if 'max' in self.checks: return f"NLA_POLICY_MAX({policy}, {self.get_limit_str('max')})" - elif 'sparse' in self.checks: + if 'sparse' in self.checks: return f"NLA_POLICY_VALIDATE_FN({policy}, &{c_lower(self.enum_name)}_validate)" return super()._attr_policy(policy) @@ -637,7 +644,7 @@ class TypeBinaryScalarArray(TypeBinary): class TypeBitfield32(Type): - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return "struct nla_bitfield32" def _attr_typol(self): @@ -665,7 +672,7 @@ class TypeNest(Type): def is_recursive(self): return self.family.pure_nested_structs[self.nested_attrs].recursive - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return self.nested_struct_type def _free_lines(self, ri, var, ref): @@ -699,7 +706,7 @@ class TypeNest(Type): f"parg.data = &{var}->{self.c_name};"] return get_lines, init_lines, None - def setter(self, ri, space, direction, deref=False, ref=None, var="req"): + def setter(self, ri, _space, direction, deref=False, ref=None, var="req"): ref = (ref if ref else []) + [self.c_name] for _, attr in ri.family.pure_nested_structs[self.nested_attrs].member_list(): @@ -724,19 +731,18 @@ class TypeMultiAttr(Type): def _complex_member_type(self, ri): if 'type' not in self.attr or self.attr['type'] == 'nest': return self.nested_struct_type - elif self.attr['type'] == 'binary' and 'struct' in self.attr: + if self.attr['type'] == 'binary' and 'struct' in self.attr: return None # use arg_member() - elif self.attr['type'] == 'string': + if self.attr['type'] == 'string': return 'struct ynl_string *' - elif self.attr['type'] in scalars: + if self.attr['type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' if self.is_auto_scalar: name = self.type[0] + '64' else: name = self.attr['type'] return scalar_pfx + name - else: - raise Exception(f"Sub-type {self.attr['type']} not supported yet") + raise Exception(f"Sub-type {self.attr['type']} not supported yet") def arg_member(self, ri): if self.type == 'binary' and 'struct' in self.attr: @@ -747,7 +753,7 @@ class TypeMultiAttr(Type): def free_needs_iter(self): return self.attr['type'] in {'nest', 'string'} - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): lines = [] if self.attr['type'] in scalars: lines += [f"free({var}->{ref}{self.c_name});"] @@ -811,13 +817,12 @@ class TypeIndexedArray(Type): def _complex_member_type(self, ri): if 'sub-type' not in self.attr or self.attr['sub-type'] == 'nest': return self.nested_struct_type - elif self.attr['sub-type'] in scalars: + if self.attr['sub-type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' return scalar_pfx + self.attr['sub-type'] - elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: return None # use arg_member() - else: - raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") + raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") def arg_member(self, ri): if self.sub_type == 'binary' and 'exact-len' in self.checks: @@ -833,12 +838,11 @@ class TypeIndexedArray(Type): def _attr_typol(self): if self.attr['sub-type'] in scalars: return f'.type = YNL_PT_U{c_upper(self.sub_type[1:])}, ' - elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: return f'.type = YNL_PT_BINARY, .len = {self.checks["exact-len"]}, ' - elif self.attr['sub-type'] == 'nest': + if self.attr['sub-type'] == 'nest': return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' - else: - raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet") + raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet") def _attr_get(self, ri, var): local_vars = ['const struct nlattr *attr2;'] @@ -874,7 +878,7 @@ class TypeIndexedArray(Type): def free_needs_iter(self): return self.sub_type == 'nest' - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): lines = [] if self.sub_type == 'nest': lines += [ @@ -885,7 +889,7 @@ class TypeIndexedArray(Type): return lines class TypeNestTypeValue(Type): - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return self.nested_struct_type def _attr_typol(self): @@ -1030,7 +1034,7 @@ class Struct: def external_selectors(self): sels = [] - for name, attr in self.attr_list: + for _name, attr in self.attr_list: if isinstance(attr, TypeSubMessage) and attr.selector.is_external(): sels.append(attr.selector) return sels @@ -1047,9 +1051,9 @@ class EnumEntry(SpecEnumEntry): super().__init__(enum_set, yaml, prev, value_start) if prev: - self.value_change = (self.value != prev.value + 1) + self.value_change = self.value != prev.value + 1 else: - self.value_change = (self.value != 0) + self.value_change = self.value != 0 self.value_change = self.value_change or self.enum_set['type'] == 'flags' # Added by resolve: @@ -1321,7 +1325,7 @@ class Family(SpecFamily): } def _load_root_sets(self): - for op_name, op in self.msgs.items(): + for _op_name, op in self.msgs.items(): if 'attribute-set' not in op: continue @@ -1520,7 +1524,7 @@ class Family(SpecFamily): for k, _ in self.root_sets.items(): yield k, None # we don't have a struct, but it must be terminal - for attr_set, struct in all_structs(): + for attr_set, _struct in all_structs(): for _, spec in self.attr_sets[attr_set].items(): if 'nested-attributes' in spec: child_name = spec['nested-attributes'] @@ -1540,7 +1544,7 @@ class Family(SpecFamily): def _load_global_policy(self): global_set = set() attr_set_name = None - for op_name, op in self.ops.items(): + for _op_name, op in self.ops.items(): if not op: continue if 'attribute-set' not in op: @@ -2049,12 +2053,12 @@ def put_op_name(family, cw): _put_enum_to_str_helper(cw, family.c_name + '_op', map_name, 'op') -def put_enum_to_str_fwd(family, cw, enum): +def put_enum_to_str_fwd(_family, cw, enum): args = [enum.user_type + ' value'] cw.write_func_prot('const char *', f'{enum.render_name}_str', args, suffix=';') -def put_enum_to_str(family, cw, enum): +def put_enum_to_str(_family, cw, enum): map_name = f'{enum.render_name}_strmap' cw.block_start(line=f"static const char * const {map_name}[] =") for entry in enum.entries.values(): @@ -2335,7 +2339,8 @@ def parse_rsp_nested_prototype(ri, struct, suffix=';'): def parse_rsp_nested(ri, struct): if struct.submsg: - return parse_rsp_submsg(ri, struct) + parse_rsp_submsg(ri, struct) + return parse_rsp_nested_prototype(ri, struct, suffix='') @@ -2715,7 +2720,7 @@ def _free_type(ri, direction, struct): def free_rsp_nested_prototype(ri): - print_free_prototype(ri, "") + print_free_prototype(ri, "") def free_rsp_nested(ri, struct): @@ -3357,7 +3362,7 @@ def render_user_family(family, cw, prototype): else: raise Exception('Invalid notification ' + ntf_op_name) _render_user_ntf_entry(ri, ntf_op) - for op_name, op in family.ops.items(): + for _op_name, op in family.ops.items(): if 'event' not in op: continue ri = RenderInfo(cw, family, "user", op, "event") @@ -3429,10 +3434,9 @@ def main(): print('Spec license:', parsed.license) print('License must be: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') os.sys.exit(1) - except yaml.YAMLError as exc: + except pyyaml.YAMLError as exc: print(exc) os.sys.exit(1) - return cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) @@ -3535,7 +3539,7 @@ def main(): cw.nl() if parsed.kernel_policy in {'per-op', 'split'}: - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): if 'do' in op and 'event' not in op: ri = RenderInfo(cw, parsed, args.mode, op, "do") print_req_policy_fwd(cw, ri.struct['request'], ri=ri) @@ -3564,7 +3568,7 @@ def main(): print_req_policy(cw, struct) cw.nl() - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): if parsed.kernel_policy in {'per-op', 'split'}: for op_mode in ['do', 'dump']: if op_mode in op and 'request' in op[op_mode]: @@ -3592,7 +3596,7 @@ def main(): ri = RenderInfo(cw, parsed, args.mode, "", "", attr_set) print_type_full(ri, struct) - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): cw.p(f"/* ============== {op.enum_name} ============== */") if 'do' in op and 'event' not in op: @@ -3625,7 +3629,7 @@ def main(): raise Exception(f'Only notifications with consistent types supported ({op.name})') print_wrapped_type(ri) - for op_name, op in parsed.ntfs.items(): + for _op_name, op in parsed.ntfs.items(): if 'event' in op: ri = RenderInfo(cw, parsed, args.mode, op, 'event') cw.p(f"/* {op.enum_name} - event */") @@ -3675,7 +3679,7 @@ def main(): if struct.reply: parse_rsp_nested(ri, struct) - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): cw.p(f"/* ============== {op.enum_name} ============== */") if 'do' in op and 'event' not in op: cw.p(f"/* {op.enum_name} - do */") @@ -3703,7 +3707,7 @@ def main(): raise Exception(f'Only notifications with consistent types supported ({op.name})') print_ntf_type_free(ri) - for op_name, op in parsed.ntfs.items(): + for _op_name, op in parsed.ntfs.items(): if 'event' in op: cw.p(f"/* {op.enum_name} - event */") -- cgit v1.2.3 From a587f592d6c49903457c1d06876ddb071907850d Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:38 +0000 Subject: tools: ynl-gen-c: fix pylint None, type, dict, generators, init Fix the following pylint warnings that are trivial one-liners: - unsubscriptable-object - unidiomatic-typecheck - use-dict-literal - attribute-defined-outside-init - consider-using-in - consider-using-generator Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-13-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 49 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 900896779e61..5f079a74c8d1 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -200,7 +200,7 @@ class Type(SpecAttr): # pylint: disable=assignment-from-none def arg_member(self, ri): member = self._complex_member_type(ri) - if member: + if member is not None: spc = ' ' if member[-1] != '*' else '' arg = [member + spc + '*' + self.c_name] if self.presence_type() == 'count': @@ -210,7 +210,7 @@ class Type(SpecAttr): def struct_member(self, ri): member = self._complex_member_type(ri) - if member: + if member is not None: ptr = '*' if self.is_multi_val() else '' if self.is_recursive_for_op(ri): ptr = '*' @@ -258,9 +258,9 @@ class Type(SpecAttr): def attr_get(self, ri, var, first): lines, init_lines, _ = self._attr_get(ri, var) - if type(lines) is str: + if isinstance(lines, str): lines = [lines] - if type(init_lines) is str: + if isinstance(init_lines, str): init_lines = [init_lines] kw = 'if' if first else 'else if' @@ -1002,7 +1002,7 @@ class Struct: self.in_multi_val = False # used by a MultiAttr or and legacy arrays self.attr_list = [] - self.attrs = dict() + self.attrs = {} if type_list is not None: for t in type_list: self.attr_list.append((t, self.attr_set[t]),) @@ -1094,8 +1094,8 @@ class EnumSet(SpecEnumSet): return EnumEntry(self, entry, prev_entry, value_start) def value_range(self): - low = min([x.value for x in self.entries.values()]) - high = max([x.value for x in self.entries.values()]) + low = min(x.value for x in self.entries.values()) + high = max(x.value for x in self.entries.values()) if high - low + 1 != len(self.entries): return None, None @@ -1234,6 +1234,12 @@ class Family(SpecFamily): self.hooks = None delattr(self, "hooks") + self.root_sets = {} + self.pure_nested_structs = {} + self.kernel_policy = None + self.global_policy = None + self.global_policy_set = None + super().__init__(file_name, exclude_ops=exclude_ops) self.fam_key = c_upper(self.yaml.get('c-family-name', self.yaml["name"] + '_FAMILY_NAME')) @@ -1268,18 +1274,18 @@ class Family(SpecFamily): self.mcgrps = self.yaml.get('mcast-groups', {'list': []}) - self.hooks = dict() + self.hooks = {} for when in ['pre', 'post']: - self.hooks[when] = dict() + self.hooks[when] = {} for op_mode in ['do', 'dump']: - self.hooks[when][op_mode] = dict() + self.hooks[when][op_mode] = {} self.hooks[when][op_mode]['set'] = set() self.hooks[when][op_mode]['list'] = [] # dict space-name -> 'request': set(attrs), 'reply': set(attrs) - self.root_sets = dict() + self.root_sets = {} # dict space-name -> Struct - self.pure_nested_structs = dict() + self.pure_nested_structs = {} self._mark_notify() self._mock_up_events() @@ -1627,7 +1633,7 @@ class RenderInfo: self.cw = cw - self.struct = dict() + self.struct = {} if op_mode == 'notify': op_mode = 'do' if 'do' in op else 'dump' for op_dir in ['request', 'reply']: @@ -1794,7 +1800,7 @@ class CodeWriter: if not local_vars: return - if type(local_vars) is str: + if isinstance(local_vars, str): local_vars = [local_vars] local_vars.sort(key=len, reverse=True) @@ -1814,20 +1820,19 @@ class CodeWriter: def writes_defines(self, defines): longest = 0 for define in defines: - if len(define[0]) > longest: - longest = len(define[0]) + longest = max(len(define[0]), longest) longest = ((longest + 8) // 8) * 8 for define in defines: line = '#define ' + define[0] line += '\t' * ((longest - len(define[0]) + 7) // 8) - if type(define[1]) is int: + if isinstance(define[1], int): line += str(define[1]) - elif type(define[1]) is str: + elif isinstance(define[1], str): line += '"' + define[1] + '"' self.p(line) def write_struct_init(self, members): - longest = max([len(x[0]) for x in members]) + longest = max(len(x[0]) for x in members) longest += 1 # because we prepend a . longest = ((longest + 8) // 8) * 8 for one in members: @@ -2670,7 +2675,7 @@ def print_req_free(ri): def print_rsp_type(ri): - if (ri.op_mode == 'do' or ri.op_mode == 'dump') and 'reply' in ri.op[ri.op_mode]: + if ri.op_mode in ('do', 'dump') and 'reply' in ri.op[ri.op_mode]: direction = 'reply' elif ri.op_mode == 'event': direction = 'reply' @@ -2683,7 +2688,7 @@ def print_wrapped_type(ri): ri.cw.block_start(line=f"{type_name(ri, 'reply')}") if ri.op_mode == 'dump': ri.cw.p(f"{type_name(ri, 'reply')} *next;") - elif ri.op_mode == 'notify' or ri.op_mode == 'event': + elif ri.op_mode in ('notify', 'event'): ri.cw.p('__u16 family;') ri.cw.p('__u8 cmd;') ri.cw.p('struct ynl_ntf_base_type *next;') @@ -2946,7 +2951,7 @@ def print_kernel_op_table_hdr(family, cw): def print_kernel_op_table(family, cw): print_kernel_op_table_fwd(family, cw, terminate=False) - if family.kernel_policy == 'global' or family.kernel_policy == 'per-op': + if family.kernel_policy in ('global', 'per-op'): for op_name, op in family.ops.items(): if op.is_async: continue -- cgit v1.2.3 From 1ecc8ae876c41befc4d4f4f85c7abd42387d06e0 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 8 Jan 2026 16:13:39 +0000 Subject: tools: ynl-gen-c: Fix remaining pylint warnings Fix the following pylint warning instances: ynl_gen_c.py:575:15: E0606: Possibly using variable 'mem' before assignment (possibly-used-before-assignment) ynl_gen_c.py:888:0: R1707: Disallow trailing comma tuple (trailing-comma-tuple) ynl_gen_c.py:944:21: C0209: Formatting a regular string which could be an f-string (consider-using-f-string) ynl_gen_c.py:1450:14: C1802: Do not use `len(SEQUENCE)` without comparison to determine if a sequence is empty (use-implicit-booleaness-not-len) ynl_gen_c.py:1688:13: W1514: Using open without explicitly specifying an encoding (unspecified-encoding) ynl_gen_c.py:3446:0: C0325: Unnecessary parens after '=' keyword (superfluous-parens) Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260108161339.29166-14-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 5f079a74c8d1..0e1e486c1185 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -571,6 +571,8 @@ class TypeBinary(Type): mem = 'NLA_POLICY_MIN_LEN(' + self.get_limit_str('min-len') + ')' elif 'max-len' in self.checks: mem = 'NLA_POLICY_MAX_LEN(' + self.get_limit_str('max-len') + ')' + else: + raise Exception('Failed to process policy check for binary type') return mem @@ -885,7 +887,7 @@ class TypeIndexedArray(Type): f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)", f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', ] - lines += f"free({var}->{ref}{self.c_name});", + lines += (f"free({var}->{ref}{self.c_name});",) return lines class TypeNestTypeValue(Type): @@ -935,15 +937,15 @@ class TypeSubMessage(TypeNest): return typol def _attr_get(self, ri, var): - sel = c_lower(self['selector']) + selector = self['selector'] + sel = c_lower(selector) if self.selector.is_external(): sel_var = f"_sel_{sel}" else: sel_var = f"{var}->{sel}" get_lines = [f'if (!{sel_var})', - 'return ynl_submsg_failed(yarg, "%s", "%s");' % - (self.name, self['selector']), - f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))", + f'return ynl_submsg_failed(yarg, "{self.name}", "{selector}");', + f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))", "return YNL_PARSE_CB_ERROR;"] init_lines = [f"parg.rsp_policy = &{self.nested_render_name}_nest;", f"parg.data = &{var}->{self.c_name};"] @@ -1447,7 +1449,7 @@ class Family(SpecFamily): attr_set_queue = list(self.root_sets.keys()) attr_set_seen = set(self.root_sets.keys()) - while len(attr_set_queue): + while attr_set_queue: a_set = attr_set_queue.pop(0) for attr, spec in self.attr_sets[a_set].items(): if 'nested-attributes' in spec: @@ -1685,7 +1687,7 @@ class CodeWriter: if not self._overwrite and os.path.isfile(self._out_file): if filecmp.cmp(self._out.name, self._out_file, shallow=False): return - with open(self._out_file, 'w+') as out_file: + with open(self._out_file, 'w+', encoding='utf-8') as out_file: self._out.seek(0) shutil.copyfileobj(self._out, out_file) self._out.close() @@ -3443,7 +3445,7 @@ def main(): print(exc) os.sys.exit(1) - cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) + cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=not args.cmp_out) _, spec_kernel = find_kernel_root(args.spec) if args.mode == 'uapi' or args.header: -- cgit v1.2.3 From 4effccde0a0521b220c3585c9a0d8e677d345209 Mon Sep 17 00:00:00 2001 From: WanLi Niu Date: Tue, 6 Jan 2026 10:31:23 +0800 Subject: bpftool: Make skeleton C++ compatible with explicit casts Fix C++ compilation errors in generated skeleton by adding explicit pointer casts and use char * subtraction for offset calculation error: invalid conversion from 'void*' to '*' [-fpermissive] | skel = skel_alloc(sizeof(*skel)); | ~~~~~~~~~~^~~~~~~~~~~~~~~ | | | void* error: arithmetic on pointers to void | skel->ctx.sz = (void *)&skel->links - (void *)skel; | ~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~ error: assigning to 'struct __ *' from incompatible type 'void *' | skel-> = skel_prep_map_data((void *)data, 4096, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | sizeof(data) - 1); | ~~~~~~~~~~~~~~~~~ error: assigning to 'struct __ *' from incompatible type 'void *' | skel-> = skel_finalize_map_data(&skel->maps..initial_value, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 4096, PROT_READ | PROT_WRITE, skel->maps..map_fd); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Minimum reproducer: $ cat test.bpf.c int val; // placed in .bss section #include "vmlinux.h" #include SEC("raw_tracepoint/sched_wakeup_new") int handle(void *ctx) { return 0; } $ cat test.cpp #include extern "C" { #include "test.bpf.skel.h" } $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h $ clang -g -O2 -target bpf -c test.bpf.c -o test.bpf.o $ bpftool gen skeleton test.bpf.o -L > test.bpf.skel.h $ g++ -c test.cpp -I. Co-developed-by: Menglong Dong Signed-off-by: WanLi Niu Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260106023123.2928-1-kiraskyler@163.com --- tools/bpf/bpftool/gen.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 993c7d9484a4..2f9e10752e28 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -731,10 +731,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h { \n\ struct %1$s *skel; \n\ \n\ - skel = skel_alloc(sizeof(*skel)); \n\ + skel = (struct %1$s *)skel_alloc(sizeof(*skel)); \n\ if (!skel) \n\ goto cleanup; \n\ - skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\ + skel->ctx.sz = (char *)&skel->links - (char *)skel; \n\ ", obj_name, opts.data_sz); bpf_object__for_each_map(map, obj) { @@ -755,7 +755,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h \n\ \"; \n\ \n\ - skel->%1$s = skel_prep_map_data((void *)data, %2$zd,\n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_prep_map_data((void *)data, %2$zd,\n\ sizeof(data) - 1);\n\ if (!skel->%1$s) \n\ goto cleanup; \n\ @@ -857,7 +857,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h codegen("\ \n\ - skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value, \n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_finalize_map_data(&skel->maps.%1$s.initial_value,\n\ %2$zd, %3$s, skel->maps.%1$s.map_fd);\n\ if (!skel->%1$s) \n\ return -ENOMEM; \n\ -- cgit v1.2.3 From 671ef08d9455f5754d1fc96f5a14e357d6b80936 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:53 +0800 Subject: selftests/resctrl: Fix a division by zero error on Hygon Change to adjust effective L3 cache size with SNC enabled change introduced the snc_nodes_per_l3_cache() function to detect the Intel Sub-NUMA Clustering (SNC) feature by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0. The function was designed to return: (1) >1: SNC mode is enabled. (2) 1: SNC mode is not enabled or not supported. However, on certain Hygon CPUs, #CPUs sharing LLC with CPU0 is actually less than #CPUs in node0. This results in snc_nodes_per_l3_cache() returning 0 (calculated as cache_cpus / node_cpus). This leads to a division by zero error in get_cache_size(): *cache_size /= snc_nodes_per_l3_cache(); Causing the resctrl selftest to fail with: "Floating point exception (core dumped)" Fix the issue by ensuring snc_nodes_per_l3_cache() returns 1 when SNC mode is not supported on the platform. Updated commit log to fix commit has issues: Shuah Khan Link: https://lore.kernel.org/r/20251217030456.3834956-2-shenxiaochen@open-hieco.net Fixes: a1cd99e700ec ("selftests/resctrl: Adjust effective L3 cache size with SNC enabled") Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrlfs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 195f04c4d158..b9c1bfb6cc02 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -243,6 +243,16 @@ int snc_nodes_per_l3_cache(void) } snc_mode = cache_cpus / node_cpus; + /* + * On some platforms (e.g. Hygon), + * cache_cpus < node_cpus, the calculated snc_mode is 0. + * + * Set snc_mode = 1 to indicate that SNC mode is not + * supported on the platform. + */ + if (!snc_mode) + snc_mode = 1; + if (snc_mode > 1) ksft_print_msg("SNC-%d mode discovered.\n", snc_mode); } -- cgit v1.2.3 From 4f4f01cc333e97b0e63b61ed1a65c928aa662f99 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:54 +0800 Subject: selftests/resctrl: Define CPU vendor IDs as bits to match usage The CPU vendor IDs are required to be unique bits because they're used for vendor_specific bitmask in the struct resctrl_test. Consider for example their usage in test_vendor_specific_check(): return get_vendor() & test->vendor_specific However, the definitions of CPU vendor IDs in file resctrl.h is quite subtle as a bitmask value: #define ARCH_INTEL 1 #define ARCH_AMD 2 A clearer and more maintainable approach is to define these CPU vendor IDs using BIT(). This ensures each vendor corresponds to a distinct bit and makes it obvious when adding new vendor IDs. Accordingly, update the return types of detect_vendor() and get_vendor() from 'int' to 'unsigned int' to align with their usage as bitmask values and to prevent potentially risky type conversions. Furthermore, introduce a bool flag 'initialized' to simplify the get_vendor() -> detect_vendor() logic. This ensures the vendor ID is detected only once and resolves the ambiguity of using the same variable 'vendor' both as a value and as a state. Link: https://lore.kernel.org/r/20251217030456.3834956-3-shenxiaochen@open-hieco.net Suggested-by: Reinette Chatre Suggested-by: Fenghua Yu Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 7 ++++--- tools/testing/selftests/resctrl/resctrl_tests.c | 26 +++++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 3c51bdac2dfa..4f9c7d04c98d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "kselftest.h" #define MB (1024 * 1024) @@ -36,8 +37,8 @@ * Define as bits because they're used for vendor_specific bitmask in * the struct resctrl_test. */ -#define ARCH_INTEL 1 -#define ARCH_AMD 2 +#define ARCH_INTEL BIT(0) +#define ARCH_AMD BIT(1) #define END_OF_TESTS 1 @@ -163,7 +164,7 @@ extern int snc_unreliable; extern char llc_occup_path[1024]; int snc_nodes_per_l3_cache(void); -int get_vendor(void); +unsigned int get_vendor(void); bool check_resctrlfs_support(void); int filter_dmesg(void); int get_domain_id(const char *resource, int cpu_no, int *domain_id); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 5154ffd821c4..42605e2a3b66 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -23,16 +23,24 @@ static struct resctrl_test *resctrl_tests[] = { &l2_noncont_cat_test, }; -static int detect_vendor(void) +static unsigned int detect_vendor(void) { - FILE *inf = fopen("/proc/cpuinfo", "r"); - int vendor_id = 0; + static unsigned int vendor_id; + static bool initialized; char *s = NULL; + FILE *inf; char *res; - if (!inf) + if (initialized) return vendor_id; + inf = fopen("/proc/cpuinfo", "r"); + if (!inf) { + vendor_id = 0; + initialized = true; + return vendor_id; + } + res = fgrep(inf, "vendor_id"); if (res) @@ -45,15 +53,17 @@ static int detect_vendor(void) fclose(inf); free(res); + + initialized = true; return vendor_id; } -int get_vendor(void) +unsigned int get_vendor(void) { - static int vendor = -1; + unsigned int vendor; + + vendor = detect_vendor(); - if (vendor == -1) - vendor = detect_vendor(); if (vendor == 0) ksft_print_msg("Can not get vendor info...\n"); -- cgit v1.2.3 From 367f931e6476747edbde4e7c7b95fc5d5b724934 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:55 +0800 Subject: selftests/resctrl: Add CPU vendor detection for Hygon The resctrl selftest currently fails on Hygon CPUs that support Platform QoS features, printing the error: "# Can not get vendor info..." This occurs because vendor detection is missing for Hygon CPUs. Fix this by extending the CPU vendor detection logic to include Hygon's vendor ID. Link: https://lore.kernel.org/r/20251217030456.3834956-4-shenxiaochen@open-hieco.net Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 1 + tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 4f9c7d04c98d..afe635b6e48d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -39,6 +39,7 @@ */ #define ARCH_INTEL BIT(0) #define ARCH_AMD BIT(1) +#define ARCH_HYGON BIT(2) #define END_OF_TESTS 1 diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 42605e2a3b66..dbcd5eea9fbc 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -50,6 +50,8 @@ static unsigned int detect_vendor(void) vendor_id = ARCH_INTEL; else if (s && !strcmp(s, ": AuthenticAMD\n")) vendor_id = ARCH_AMD; + else if (s && !strcmp(s, ": HygonGenuine\n")) + vendor_id = ARCH_HYGON; fclose(inf); free(res); -- cgit v1.2.3 From 86063a2568b8f2eeb68da1411b320c0ff778f852 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:56 +0800 Subject: selftests/resctrl: Fix non-contiguous CBM check for Hygon The resctrl selftest currently fails on Hygon CPUs that always supports non-contiguous CBM, printing the error: "# Hardware and kernel differ on non-contiguous CBM support!" This occurs because the arch_supports_noncont_cat() function lacks vendor detection for Hygon CPUs, preventing proper identification of their non-contiguous CBM capability. Fix this by adding Hygon vendor ID detection to arch_supports_noncont_cat(). Link: https://lore.kernel.org/r/20251217030456.3834956-5-shenxiaochen@open-hieco.net Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 94cfdba5308d..f00b622c1460 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,8 +290,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - /* AMD always supports non-contiguous CBM. */ - if (get_vendor() == ARCH_AMD) + unsigned int vendor_id = get_vendor(); + + /* AMD and Hygon always support non-contiguous CBM. */ + if (vendor_id == ARCH_AMD || vendor_id == ARCH_HYGON) return true; #if defined(__i386__) || defined(__x86_64__) /* arch */ -- cgit v1.2.3 From 5714ca8cba5ed736f3733663c446cbee63a10a64 Mon Sep 17 00:00:00 2001 From: Varun R Mallya Date: Wed, 7 Jan 2026 05:05:27 +0530 Subject: libbpf: Fix OOB read in btf_dump_get_bitfield_value When dumping bitfield data, btf_dump_get_bitfield_value() reads data based on the underlying type's size (t->size). However, it does not verify that the provided data buffer (data_sz) is large enough to contain these bytes. If btf_dump__dump_type_data() is called with a buffer smaller than the type's size, this leads to an out-of-bounds read. This was confirmed by AddressSanitizer in the linked issue. Fix this by ensuring we do not read past the provided data_sz limit. Fixes: a1d3cc3c5eca ("libbpf: Avoid use of __int128 in typed dump display") Reported-by: Harrison Green Suggested-by: Alan Maguire Signed-off-by: Varun R Mallya Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260106233527.163487-1-varunrmallya@gmail.com Closes: https://github.com/libbpf/libbpf/issues/928 --- tools/lib/bpf/btf_dump.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6388392f49a0..53c6624161d7 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1762,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d, __u16 left_shift_bits, right_shift_bits; const __u8 *bytes = data; __u8 nr_copy_bits; + __u8 start_bit, nr_bytes; __u64 num = 0; int i; + /* Calculate how many bytes cover the bitfield */ + start_bit = bits_offset % 8; + nr_bytes = (start_bit + bit_sz + 7) / 8; + + /* Bound check */ + if (data + nr_bytes > d->typed_dump->data_end) + return -E2BIG; + /* Maximum supported bitfield size is 64 bits */ if (t->size > 8) { pr_warn("unexpected bitfield size %d\n", t->size); -- cgit v1.2.3 From 96ea4fa60c4528d95bdbce7f4212c015ab3e8113 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Jan 2026 12:02:05 -0800 Subject: selftests: tls: avoid flakiness in data_steal We see the following failure a few times a week: # RUN global.data_steal ... # tls.c:3280:data_steal:Expected recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT) (10000) == -1 (-1) # data_steal: Test failed # FAIL global.data_steal not ok 8 global.data_steal The 10000 bytes read suggests that the child process did a recv() of half of the data using the TLS ULP and we're now getting the remaining half. The intent of the test is to get the child to enter _TCP_ recvmsg handler, so it needs to enter the syscall before parent installed the TLS recvmsg with setsockopt(SOL_TLS). Instead of the 10msec sleep send 1 byte of data and wait for the child to consume it. Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260106200205.1593915-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index a4d16a460fbe..9e2ccea13d70 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -3260,17 +3260,25 @@ TEST(data_steal) { ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0); /* Spawn a child and get it into the read wait path of the underlying - * TCP socket. + * TCP socket (before kernel .recvmsg is replaced with the TLS one). */ pid = fork(); ASSERT_GE(pid, 0); if (!pid) { - EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), - sizeof(buf) / 2); + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2 + 1, MSG_WAITALL), + sizeof(buf) / 2 + 1); exit(!__test_passed(_metadata)); } - usleep(10000); + /* Send a sync byte and poll until it's consumed to ensure + * the child is in recv() before we proceed to install TLS. + */ + ASSERT_EQ(send(fd, buf, 1, 0), 1); + do { + usleep(500); + } while (recv(cfd, buf, 1, MSG_PEEK | MSG_DONTWAIT) == 1); + EXPECT_EQ(errno, EAGAIN); + ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); -- cgit v1.2.3 From a0ac0ff382767a5dbde266fb5f7997a5d6b70e10 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Jan 2026 15:25:57 -0800 Subject: selftests: drv-net: gro: increase the rcvbuf size The gro.py test (testing software GRO) is slightly flaky when running against fbnic. We see one flake per roughly 20 runs in NIPA, mostly in ipip.large, and always including some EAGAIN: # Shouldn't coalesce if exceed IP max pkt size: Test succeeded # Expected {65475 899 }, Total 2 packets # Received {65475 899 }, Total 2 packets. # Expected {64576 900 900 }, Total 3 packets # Received {64576 /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable The test sends 2 large frames (64k + change). Looks like the default packet socket rcvbuf (~200kB) may not be large enough to hold them. Bump the rcvbuf to 1MB. Add a debug print showing socket statistics to make debugging this issue easier in the future. Without the rcvbuf increase we see: # Shouldn't coalesce if exceed IP max pkt size: Test succeeded # Expected {65475 899 }, Total 2 packets # Received {65475 899 }, Total 2 packets. # Expected {64576 900 900 }, Total 3 packets # Received {64576 Socket stats: packets=7, drops=3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260107232557.2147760-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index e894037d2e3e..751a8103f408 100644 --- a/tools/testing/selftests/drivers/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -926,6 +926,28 @@ static void set_timeout(int fd) error(1, errno, "cannot set timeout, setsockopt failed"); } +static void set_rcvbuf(int fd) +{ + int bufsize = 1 * 1024 * 1024; /* 1 MB */ + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize))) + error(1, errno, "cannot set rcvbuf size, setsockopt failed"); +} + +static void recv_error(int fd, int rcv_errno) +{ + struct tpacket_stats stats; + socklen_t len; + + len = sizeof(stats); + if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len)) + error(1, errno, "can't get stats"); + + fprintf(stderr, "Socket stats: packets=%u, drops=%u\n", + stats.tp_packets, stats.tp_drops); + error(1, rcv_errno, "could not receive"); +} + static void check_recv_pkts(int fd, int *correct_payload, int correct_num_pkts) { @@ -950,7 +972,7 @@ static void check_recv_pkts(int fd, int *correct_payload, ip_ext_len = 0; pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); if (pkt_size < 0) - error(1, errno, "could not receive"); + recv_error(fd, errno); if (iph->version == 4) ip_ext_len = (iph->ihl - 5) * 4; @@ -1126,6 +1148,7 @@ static void gro_receiver(void) error(1, 0, "socket creation"); setup_sock_filter(rxfd); set_timeout(rxfd); + set_rcvbuf(rxfd); bind_packetsocket(rxfd); ksft_ready(); -- cgit v1.2.3 From 68ec2b9fc59e8053f17ee1d1a5b1959c43a19202 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Jan 2026 06:53:19 -0800 Subject: selftests: forwarding: update PTP tcpdump patterns Recent version of tcpdump (tcpdump-4.99.6-1.fc43.x86_64) seems to have removed the spurious space after msg type in PTP info, e.g.: before: PTPv2, majorSdoId: 0x0, msg type : sync msg, length: 44 after: PTPv2, majorSdoId: 0x0, msg type: sync msg, length: 44 Update our patterns to match both. Reviewed-by: Alexander Sverdlin Link: https://patch.msgid.link/20260107145320.1837464-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/local_termination.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 892895659c7e..1f2bf6e81847 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -306,39 +306,39 @@ run_test() if [ $skip_ptp = false ]; then check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" fi -- cgit v1.2.3 From a1025dcd377ef92d9a09af03b70ce80be281ee22 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Dec 2025 00:44:49 +0100 Subject: selftests: kvm: replace numbered sync points with actions Rework the guest=>host syncs in the AMX test to use named actions instead of arbitrary, incrementing numbers. The "stage" of the test has no real meaning, what matters is what action the test wants the host to perform. The incrementing numbers are somewhat helpful for triaging failures, but fully debugging failures almost always requires a much deeper dive into the test (and KVM). Using named actions not only makes it easier to extend the test without having to shift all sync point numbers, it makes the code easier to read. [Commit message by Sean Christopherson] Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 88 +++++++++++++++--------------- 1 file changed, 43 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index f4ce5a185a7d..3de4402ac17d 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -124,6 +124,14 @@ static void set_tilecfg(struct tile_config *cfg) } } +enum { + /* Check TMM0 against tiledata */ + TEST_COMPARE_TILEDATA = 1, + + /* Full VM save/restore */ + TEST_SAVE_RESTORE = 2, +}; + static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, struct tile_data *tiledata, struct xstate *xstate) @@ -131,20 +139,20 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && this_cpu_has(X86_FEATURE_OSXSAVE)); check_xtile_info(); - GUEST_SYNC(1); + GUEST_SYNC(TEST_SAVE_RESTORE); /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(2); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); - GUEST_SYNC(3); + GUEST_SYNC(TEST_SAVE_RESTORE); /* Check save/restore when trap to userspace */ __tileloadd(tiledata); - GUEST_SYNC(4); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); __tilerelease(); - GUEST_SYNC(5); + GUEST_SYNC(TEST_SAVE_RESTORE); /* * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in * the xcomp_bv. @@ -154,6 +162,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA); + /* #NM test */ + /* xfd=0x40000, disable amx tiledata */ wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); @@ -166,13 +176,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA)); - GUEST_SYNC(6); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); /* Trigger #NM exception */ __tileloadd(tiledata); - GUEST_SYNC(10); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); GUEST_DONE(); } @@ -180,18 +190,18 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, void guest_nm_handler(struct ex_regs *regs) { /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ - GUEST_SYNC(7); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); - GUEST_SYNC(8); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); /* Clear xfd_err */ wrmsr(MSR_IA32_XFD_ERR, 0); /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(9); + GUEST_SYNC(TEST_SAVE_RESTORE); } int main(int argc, char *argv[]) @@ -244,6 +254,7 @@ int main(int argc, char *argv[]) memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); + int iter = 0; for (;;) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -253,20 +264,9 @@ int main(int argc, char *argv[]) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: - switch (uc.args[1]) { - case 1: - case 2: - case 3: - case 5: - case 6: - case 7: - case 8: - fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); - break; - case 4: - case 10: - fprintf(stderr, - "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + ++iter; + if (uc.args[1] & TEST_COMPARE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter); /* Compacted mode, get amx offset by xsave area * size subtract 8K amx size. @@ -279,11 +279,25 @@ int main(int argc, char *argv[]) ret = memcmp(amx_start, tiles_data, TILE_SIZE); TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); kvm_x86_state_cleanup(state); - break; - case 9: - fprintf(stderr, - "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); - break; + } + if (uc.args[1] & TEST_SAVE_RESTORE) { + fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter); + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); } break; case UCALL_DONE: @@ -293,22 +307,6 @@ int main(int argc, char *argv[]) TEST_FAIL("Unknown ucall %lu", uc.cmd); } - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); } done: kvm_vm_free(vm); -- cgit v1.2.3 From 0383a8edef396cf0a6884b0be81d62bde60737b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Dec 2025 16:47:26 +0100 Subject: selftests: kvm: try getting XFD and XSAVE state out of sync The host is allowed to set FPU state that includes a disabled xstate component. Check that this does not cause bad effects. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index 3de4402ac17d..bee56c1f7833 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -125,11 +125,17 @@ static void set_tilecfg(struct tile_config *cfg) } enum { + /* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */ + TEST_SAVE_TILEDATA = 1, + /* Check TMM0 against tiledata */ - TEST_COMPARE_TILEDATA = 1, + TEST_COMPARE_TILEDATA = 2, + + /* Restore TMM0 from earlier save */ + TEST_RESTORE_TILEDATA = 4, /* Full VM save/restore */ - TEST_SAVE_RESTORE = 2, + TEST_SAVE_RESTORE = 8, }; static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, @@ -150,7 +156,16 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_SYNC(TEST_SAVE_RESTORE); /* Check save/restore when trap to userspace */ __tileloadd(tiledata); - GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); + + /* host tries setting tiledata while guest XFD is set */ + GUEST_SYNC(TEST_RESTORE_TILEDATA); + GUEST_SYNC(TEST_SAVE_RESTORE); + + wrmsr(MSR_IA32_XFD, 0); __tilerelease(); GUEST_SYNC(TEST_SAVE_RESTORE); /* @@ -210,10 +225,10 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_x86_state *state; + struct kvm_x86_state *tile_state = NULL; int xsave_restore_size; vm_vaddr_t amx_cfg, tiledata, xstate; struct ucall uc; - u32 amx_offset; int ret; /* @@ -265,20 +280,27 @@ int main(int argc, char *argv[]) /* NOT REACHED */ case UCALL_SYNC: ++iter; + if (uc.args[1] & TEST_SAVE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter); + tile_state = vcpu_save_state(vcpu); + } if (uc.args[1] & TEST_COMPARE_TILEDATA) { fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter); /* Compacted mode, get amx offset by xsave area * size subtract 8K amx size. */ - amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; - state = vcpu_save_state(vcpu); - void *amx_start = (void *)state->xsave + amx_offset; + u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + void *amx_start = (void *)tile_state->xsave + amx_offset; void *tiles_data = (void *)addr_gva2hva(vm, tiledata); /* Only check TMM0 register, 1 tile */ ret = memcmp(amx_start, tiles_data, TILE_SIZE); TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); - kvm_x86_state_cleanup(state); + } + if (uc.args[1] & TEST_RESTORE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter); + vcpu_xsave_set(vcpu, tile_state->xsave); + fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter); } if (uc.args[1] & TEST_SAVE_RESTORE) { fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter); -- cgit v1.2.3 From 3611ca7c12b740e250d83f8bbe3554b740c503b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 29 Dec 2025 12:23:30 -0800 Subject: selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1 Rework the AMX test's #NM handling to use kvm_asm_safe() to verify an #NM actually occurs. As is, a completely missing #NM could go unnoticed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index bee56c1f7833..37b166260ee3 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile) : : "a"(tile), "d"(0)); } +static inline int tileloadd_safe(void *tile) +{ + return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10", + "a"(tile), "d"(0)); +} + static inline void __tilerelease(void) { asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); @@ -142,6 +148,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, struct tile_data *tiledata, struct xstate *xstate) { + int vector; + GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && this_cpu_has(X86_FEATURE_OSXSAVE)); check_xtile_info(); @@ -195,17 +203,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); - /* Trigger #NM exception */ - __tileloadd(tiledata); - GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); - GUEST_DONE(); -} + /* Trigger #NM exception */ + vector = tileloadd_safe(tiledata); + __GUEST_ASSERT(vector == NM_VECTOR, + "Wanted #NM on tileloadd with XFD[18]=1, got %s", + ex_str(vector)); -void guest_nm_handler(struct ex_regs *regs) -{ - /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ - GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); @@ -217,6 +221,11 @@ void guest_nm_handler(struct ex_regs *regs) /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); GUEST_SYNC(TEST_SAVE_RESTORE); + + __tileloadd(tiledata); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -253,9 +262,6 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); - /* Register #NM handler */ - vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); - /* amx cfg for guest_code */ amx_cfg = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); -- cgit v1.2.3 From c39a6a277e0e67ffff6a8efcbbf7e7e23ce9e38c Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 8 Jan 2026 12:44:19 +0100 Subject: vsock/test: add a final full barrier after run all tests If the last test fails, the other side still completes correctly, which could lead to false positives. Let's add a final barrier that ensures that the last test has finished correctly on both sides, but also that the two sides agree on the number of tests to be performed. Fixes: 2f65b44e199c ("VSOCK: add full barrier between test cases") Reviewed-by: Luigi Leonardi Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260108114419.52747-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index d843643ced6b..9430ef5b8bc3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -511,6 +511,18 @@ void run_tests(const struct test_case *test_cases, printf("ok\n"); } + + printf("All tests have been executed. Waiting other peer..."); + fflush(stdout); + + /* + * Final full barrier, to ensure that all tests have been run and + * that even the last one has been successful on both sides. + */ + control_writeln("COMPLETED"); + control_expectln("COMPLETED"); + + printf("ok\n"); } void list_tests(const struct test_case *test_cases) -- cgit v1.2.3 From 9086984ff52e703cd7ce47ae19f12d8d31914396 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 9 Jan 2026 13:08:51 +0200 Subject: selftests: drv-net: psp: Better control the used PSP dev The PSP responder fails when zero or multiple PSP devices are detected. There's an option to select the device id to use (-d) but it's currently not used from the PSP self test. It's also hard to use because the PSP test doesn't dump the PSP devices so can't choose one. When zero devices are detected, psp_responder fails which will cause the parent test to fail as well instead of skipping PSP tests. Fix both of these problems. Change psp_responder to: - not fail when no PSP devs are detected. - get an optional -i ifindex argument instead of -d. - select the correct PSP dev from the dump corresponding to ifindex or - select the first PSP dev when -i is not given. - fail when multiple devs are found and -i is not given. - warn and continue when the requested ifindex is not found. Also plumb the ifindex from the Python test. With these, when there are no PSP devs found or the wrong one is chosen, psp_responder opens the server socket, listens for control connections normally, and leaves the skipping of the various test cases which require a PSP device (~most, but not all of them) to the parent test. This results in output like: ok 1 psp.test_case # SKIP No PSP devices found [...] ok 12 psp.dev_get_device # SKIP No PSP devices found ok 13 psp.dev_get_device_bad ok 14 psp.dev_rotate # SKIP No PSP devices found [...] Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Link: https://patch.msgid.link/20260109110851.2952906-2-cratiu@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 1 + tools/testing/selftests/drivers/net/psp.py | 4 +- .../testing/selftests/drivers/net/psp_responder.c | 50 ++++++++++------------ 3 files changed, 26 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 8b644fd84ff2..63495376e654 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -170,6 +170,7 @@ class NetDrvEpEnv(NetDrvEnvBase): self.remote_ifname = self.resolve_remote_ifc() self.remote_dev = ip("-d link show dev " + self.remote_ifname, host=self.remote, json=True)[0] + self.remote_ifindex = self.remote_dev['ifindex'] self._required_cmd = {} diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 52523bdad240..528a421ecf76 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -601,8 +601,8 @@ def main() -> None: cfg.comm_port = rand_port() srv = None try: - with bkg(responder + f" -p {cfg.comm_port}", host=cfg.remote, - exit_wait=True) as srv: + with bkg(responder + f" -p {cfg.comm_port} -i {cfg.remote_ifindex}", + host=cfg.remote, exit_wait=True) as srv: wait_port_listen(cfg.comm_port, host=cfg.remote) cfg.comm_sock = socket.create_connection((cfg.remote_addr, diff --git a/tools/testing/selftests/drivers/net/psp_responder.c b/tools/testing/selftests/drivers/net/psp_responder.c index f309e0d73cbf..a26e7628bbb1 100644 --- a/tools/testing/selftests/drivers/net/psp_responder.c +++ b/tools/testing/selftests/drivers/net/psp_responder.c @@ -22,7 +22,7 @@ static bool should_quit; struct opts { int port; - int devid; + int ifindex; bool verbose; }; @@ -360,7 +360,7 @@ static void usage(const char *name, const char *miss) if (miss) fprintf(stderr, "Missing argument: %s\n", miss); - fprintf(stderr, "Usage: %s -p port [-v] [-d psp-dev-id]\n", name); + fprintf(stderr, "Usage: %s -p port [-v] [-i ifindex]\n", name); exit(EXIT_FAILURE); } @@ -368,7 +368,7 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) { int opt; - while ((opt = getopt(argc, argv, "vp:d:")) != -1) { + while ((opt = getopt(argc, argv, "vp:i:")) != -1) { switch (opt) { case 'v': opts->verbose = 1; @@ -376,8 +376,8 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) case 'p': opts->port = atoi(optarg); break; - case 'd': - opts->devid = atoi(optarg); + case 'i': + opts->ifindex = atoi(optarg); break; default: usage(argv[0], NULL); @@ -410,12 +410,11 @@ static int psp_dev_set_ena(struct ynl_sock *ys, __u32 dev_id, __u32 versions) int main(int argc, char **argv) { struct psp_dev_get_list *dev_list; - bool devid_found = false; __u32 ver_ena, ver_cap; struct opts opts = {}; struct ynl_error yerr; struct ynl_sock *ys; - int first_id = 0; + int devid = -1; int ret; parse_cmd_opts(argc, argv, &opts); @@ -429,20 +428,19 @@ int main(int argc, char **argv) } dev_list = psp_dev_get_dump(ys); - if (ynl_dump_empty(dev_list)) { - if (ys->err.code) - goto err_close; - fprintf(stderr, "No PSP devices\n"); - goto err_close_silent; - } + if (ynl_dump_empty(dev_list) && ys->err.code) + goto err_close; ynl_dump_foreach(dev_list, d) { - if (opts.devid) { - devid_found = true; + if (opts.ifindex) { + if (d->ifindex != opts.ifindex) + continue; + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; - } else if (!first_id) { - first_id = d->id; + break; + } else if (devid < 0) { + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; } else { @@ -452,23 +450,21 @@ int main(int argc, char **argv) } psp_dev_get_list_free(dev_list); - if (opts.devid && !devid_found) { - fprintf(stderr, "PSP device %d requested on cmdline, not found\n", - opts.devid); - goto err_close_silent; - } else if (!opts.devid) { - opts.devid = first_id; - } + if (opts.ifindex && devid < 0) + fprintf(stderr, + "WARN: PSP device with ifindex %d requested on cmdline, not found\n", + opts.ifindex); - if (ver_ena != ver_cap) { - ret = psp_dev_set_ena(ys, opts.devid, ver_cap); + if (devid >= 0 && ver_ena != ver_cap) { + ret = psp_dev_set_ena(ys, devid, ver_cap); if (ret) goto err_close; } ret = run_responder(ys, &opts); - if (ver_ena != ver_cap && psp_dev_set_ena(ys, opts.devid, ver_ena)) + if (devid >= 0 && ver_ena != ver_cap && + psp_dev_set_ena(ys, devid, ver_ena)) fprintf(stderr, "WARN: failed to set the PSP versions back\n"); ynl_sock_destroy(ys); -- cgit v1.2.3 From de7c600e2d5b501c0c04bde8ebab89ac5888a69f Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 8 Jan 2026 15:45:21 -0800 Subject: selftests/net: parametrise iou-zcrx.py with ksft_variants Use ksft_variants to parametrise tests in iou-zcrx.py to either use single queues or RSS contexts, reducing duplication. Signed-off-by: David Wei Link: https://patch.msgid.link/20260108234521.3619621-1-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 162 ++++++++++----------- 1 file changed, 73 insertions(+), 89 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..2c5acfb4f5dc 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -3,132 +3,114 @@ import re from os import path -from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_run, ksft_exit, KsftSkipEx, ksft_variants, KsftNamedVariant from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +from lib.py import EthtoolFamily -def _get_current_settings(cfg): - output = ethtool(f"-g {cfg.ifname}", json=True)[0] - return (output['rx'], output['hds-thresh']) - - -def _get_combined_channels(cfg): - output = ethtool(f"-l {cfg.ifname}").stdout - values = re.findall(r'Combined:\s+(\d+)', output) - return int(values[1]) - - -def _create_rss_ctx(cfg, chan): - output = ethtool(f"-X {cfg.ifname} context new start {chan} equal 1").stdout +def create_rss_ctx(cfg): + output = ethtool(f"-X {cfg.ifname} context new start {cfg.target} equal 1").stdout values = re.search(r'New RSS context is (\d+)', output).group(1) - ctx_id = int(values) - return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}")) + return int(values) -def _set_flow_rule(cfg, port, chan): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}").stdout +def set_flow_rule(cfg): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.target}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def _set_flow_rule_rss(cfg, port, ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} context {ctx_id}").stdout +def set_flow_rule_rss(cfg, rss_ctx_id): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def test_zcrx(cfg) -> None: - cfg.require_ipver('6') - - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() - - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") +def single(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + flow_rule_id = set_flow_rule(cfg) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") - cmd(tx_cmd, host=cfg.remote) +def rss(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') -def test_zcrx_oneshot(cfg) -> None: - cfg.require_ipver('6') + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") + defer(ethtool, f"-X {cfg.ifname} default") - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rss_ctx_id = create_rss_ctx(cfg) + defer(ethtool, f"-X {cfg.ifname} delete context {rss_ctx_id}") - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + flow_rule_id = set_flow_rule_rss(cfg, rss_ctx_id) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") - defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx(cfg, setup) -> None: + cfg.require_ipver('6') - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 4096 -z 16384" + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") + wait_port_listen(cfg.port, proto="tcp") cmd(tx_cmd, host=cfg.remote) -def test_zcrx_rss(cfg) -> None: +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx_oneshot(cfg, setup) -> None: cfg.require_ipver('6') - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() - - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") - - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") - - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") - - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") - defer(ethtool, f"-X {cfg.ifname} default") - - (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans - 1) - flow_rule_id = _set_flow_rule_rss(cfg, port, ctx_id) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -o 4" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 4096 -z 16384" with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") + wait_port_listen(cfg.port, proto="tcp") cmd(tx_cmd, host=cfg.remote) @@ -137,7 +119,9 @@ def main() -> None: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + cfg.ethnl = EthtoolFamily() + cfg.port = rand_port() + ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot], args=(cfg, )) ksft_exit() -- cgit v1.2.3 From 799a4912eea74c667da1c8167f93bf2d1508a89e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 8 Jan 2026 14:52:56 -0800 Subject: selftests: net: py: capitalize defer queue and improve import Import utils and refer to the global defer queue that way instead of importing the queue. This will make it possible to assign value to the global variable. While at it capitalize the name, to comply with the Python coding style. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260108225257.2684238-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 8 ++++---- tools/testing/selftests/net/lib/py/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 531e7fa1b3ea..248cd1a723a3 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -8,7 +8,7 @@ import time import traceback from collections import namedtuple from .consts import KSFT_MAIN_NAME -from .utils import global_defer_queue +from . import utils KSFT_RESULT = None KSFT_RESULT_ALL = True @@ -157,10 +157,10 @@ def ksft_flush_defer(): global KSFT_RESULT i = 0 - qlen_start = len(global_defer_queue) - while global_defer_queue: + qlen_start = len(utils.GLOBAL_DEFER_QUEUE) + while utils.GLOBAL_DEFER_QUEUE: i += 1 - entry = global_defer_queue.pop() + entry = utils.GLOBAL_DEFER_QUEUE.pop() try: entry.exec_only() except Exception: diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 106ee1f2df86..2dde34560d65 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -141,7 +141,7 @@ class bkg(cmd): return self.process(terminate=terminate, fail=self.check_fail) -global_defer_queue = [] +GLOBAL_DEFER_QUEUE = [] class defer: @@ -153,7 +153,7 @@ class defer: self.args = args self.kwargs = kwargs - self._queue = global_defer_queue + self._queue = GLOBAL_DEFER_QUEUE self._queue.append(self) def __enter__(self): -- cgit v1.2.3 From 7a1ff3545adeec5dc65c3063c2f084500d6f7014 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 8 Jan 2026 14:52:57 -0800 Subject: selftests: net: py: ensure defer() is only used within a test case I wasted a couple of hours recently after accidentally adding a defer() from within a function which itself was called as part of defer(). This leads to an infinite loop of defer(). Make sure this cannot happen and raise a helpful exception. I understand that the pair of _ksft_defer_arm() calls may not be the most Pythonic way to implement this, but it's easy enough to understand. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260108225257.2684238-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 7 +++++++ tools/testing/selftests/net/lib/py/utils.py | 3 +++ 2 files changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 248cd1a723a3..0a96f88bb60a 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -153,6 +153,11 @@ def ktap_result(ok, cnt=1, case_name="", comment=""): print(res, flush=True) +def _ksft_defer_arm(state): + """ Allow or disallow the use of defer() """ + utils.GLOBAL_DEFER_ARMED = state + + def ksft_flush_defer(): global KSFT_RESULT @@ -315,6 +320,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): comment = "" cnt_key = "" + _ksft_defer_arm(True) try: func(*args) except KsftSkipEx as e: @@ -332,6 +338,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False cnt_key = 'fail' + _ksft_defer_arm(False) try: ksft_flush_defer() diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 2dde34560d65..824f039d384c 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -142,6 +142,7 @@ class bkg(cmd): GLOBAL_DEFER_QUEUE = [] +GLOBAL_DEFER_ARMED = False class defer: @@ -153,6 +154,8 @@ class defer: self.args = args self.kwargs = kwargs + if not GLOBAL_DEFER_ARMED: + raise Exception("defer queue not armed, did you use defer() outside of a test case?") self._queue = GLOBAL_DEFER_QUEUE self._queue.append(self) -- cgit v1.2.3 From 4203c6fb5e9d2e4fb9a48b421d92efd4429a4d55 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:57 +0100 Subject: selftests/nolibc: try to read from stdin in readv_zero test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When stdout is redirected to a file this test fails. This happens when running through the kselftest runner since commit d9e6269e3303 ("selftests/run_kselftest.sh: exit with error if tests fail"). For consistency with other tests that read from a file descriptor, switch to stdin over stdout. The tests are still brittle against a redirected stdin, but at least they are now consistently so. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-1-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/nolibc-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 3986d55a6ff6..e83c1e7e2beb 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1404,7 +1404,7 @@ int run_syscall(int min, int max) CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break; - CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; + CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(0, NULL, 0)); break; CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; CASE_TEST(ptrace); EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break; -- cgit v1.2.3 From 20c72de1f8a9e338531579bd784371aba4b7dd2c Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:58 +0100 Subject: selftests/nolibc: scope custom flags to the nolibc-test target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new target for 'libc-test' is going to be added which should not be affected by these options. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-2-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 40f5c2908dda..43f0b608c796 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -9,14 +9,10 @@ cc-option = $(call __cc-option, $(CC),,$(1),$(2)) include Makefile.include -CFLAGS = -nostdlib -nostdinc -static \ +$(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \ -isystem $(top_srcdir)/tools/include/nolibc -isystem $(top_srcdir)/usr/include \ $(CFLAGS_NOLIBC_TEST) - -ifeq ($(LLVM),) -LDLIBS := -lgcc -endif - +$(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc) $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers help: -- cgit v1.2.3 From 6fe8360b16acbfb50c703f52568cad46759be2ed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:59 +0100 Subject: selftests/nolibc: also test libc-test through regular selftest framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hook up libc-test to the regular selftest build to make sure nolibc-test.c stays compatible with a normal libc. As the pattern rule from lib.mk does not handle compiling a target from a differently named source file, add an explicit rule definition. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-3-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 43f0b608c796..0370489d938b 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := nolibc-test +TEST_GEN_PROGS := nolibc-test libc-test include ../lib.mk include $(top_srcdir)/scripts/Makefile.compiler @@ -15,6 +15,10 @@ $(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \ $(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc) $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers +$(OUTPUT)/libc-test: nolibc-test.c nolibc-test-linkage.c + $(call msg,CC,,$@) + $(Q)$(LINK.c) $^ -o $@ + help: @echo "For the custom nolibc testsuite use '$(MAKE) -f Makefile.nolibc'; available targets:" @$(MAKE) -f Makefile.nolibc help -- cgit v1.2.3 From edaf30743185f6ed8e29dcb2f1d01e183c0b807b Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Mon, 5 Jan 2026 11:36:27 +0900 Subject: tools/nolibc: Add fread() to stdio.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a very basic version of fread() like we already have for fwrite(). Signed-off-by: Daniel Palmer Link: https://patch.msgid.link/20260105023629.1502801-2-daniel@thingy.jp Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stdio.h | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 1f16dab2ac88..6904252df97d 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -170,7 +170,7 @@ int putchar(int c) } -/* fwrite(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ +/* fwrite(), fread(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ /* internal fwrite()-like function which only takes a size and returns 0 on * success or EOF on error. It automatically retries on short writes. @@ -204,6 +204,38 @@ size_t fwrite(const void *s, size_t size, size_t nmemb, FILE *stream) return written; } +/* internal fread()-like function which only takes a size and returns 0 on + * success or EOF on error. It automatically retries on short reads. + */ +static __attribute__((unused)) +int _fread(void *buf, size_t size, FILE *stream) +{ + int fd = fileno(stream); + ssize_t ret; + + while (size) { + ret = read(fd, buf, size); + if (ret <= 0) + return EOF; + size -= ret; + buf += ret; + } + return 0; +} + +static __attribute__((unused)) +size_t fread(void *s, size_t size, size_t nmemb, FILE *stream) +{ + size_t nread; + + for (nread = 0; nread < nmemb; nread++) { + if (_fread(s, size, stream) != 0) + break; + s += size; + } + return nread; +} + static __attribute__((unused)) int fputs(const char *s, FILE *stream) { -- cgit v1.2.3 From 109770cc81680b802ee983b09b61c3979240fd09 Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Mon, 5 Jan 2026 11:36:28 +0900 Subject: tools/nolibc: Add fseek() to stdio.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A very basic wrapper around lseek() that implements fseek(). Signed-off-by: Daniel Palmer Link: https://patch.msgid.link/20260105023629.1502801-3-daniel@thingy.jp Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stdio.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 6904252df97d..233318b0d0f0 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -272,6 +272,25 @@ char *fgets(char *s, int size, FILE *stream) } +/* fseek */ +static __attribute__((unused)) +int fseek(FILE *stream, long offset, int whence) +{ + int fd = fileno(stream); + off_t ret; + + ret = lseek(fd, offset, whence); + + /* lseek() and fseek() differ in that lseek returns the new + * position or -1, fseek() returns either 0 or -1. + */ + if (ret >= 0) + return 0; + + return -1; +} + + /* minimal printf(). It supports the following formats: * - %[l*]{d,u,c,x,p} * - %s -- cgit v1.2.3 From a5f00be9b3b07d92c6689997403851a32e1874cc Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Mon, 5 Jan 2026 11:36:29 +0900 Subject: tools/nolibc: Add a simple test for writing to a FILE and reading it back MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test that exercises create->write->seek->read to check that using the stream functions (fwrite() etc) is not totally broken. The only edge cases this is testing for are: - Reading the file after writing but without rewinding reads nothing. - Trying to read more items than the file contains returns the count of fully read items. Signed-off-by: Daniel Palmer Link: https://patch.msgid.link/20260105023629.1502801-4-daniel@thingy.jp Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index e83c1e7e2beb..1b9d3b2e2491 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -878,6 +878,58 @@ int test_file_stream(void) return 0; } +int test_file_stream_wsr(void) +{ + const char dataout[] = "foo"; + const size_t datasz = sizeof(dataout); + char datain[datasz]; + int fd, r; + FILE *f; + + fd = open("/tmp", O_TMPFILE | O_RDWR, 0644); + if (fd == -1) + return -1; + + f = fdopen(fd, "w+"); + if (!f) + return -1; + + errno = 0; + r = fwrite(dataout, 1, datasz, f); + if (r != datasz) + return -1; + + /* Attempt to read from the file without rewinding, + * we should read 0 items. + */ + r = fread(datain, 1, datasz, f); + if (r) + return -1; + + /* Rewind the file to the start */ + r = fseek(f, 0, SEEK_SET); + if (r) + return -1; + + /* Attempt to read back more than was written to + * make sure we handle short reads properly. + * fread() should return the number of complete items. + */ + r = fread(datain, 1, datasz + 1, f); + if (r != datasz) + return -1; + + /* Data we read should match the data we just wrote */ + if (memcmp(datain, dataout, datasz) != 0) + return -1; + + r = fclose(f); + if (r) + return -1; + + return 0; +} + enum fork_type { FORK_STANDARD, FORK_VFORK, @@ -1352,6 +1404,7 @@ int run_syscall(int min, int max) CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break; CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break; CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break; + CASE_TEST(file_stream_wsr); EXPECT_SYSZR(1, test_file_stream_wsr()); break; CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; -- cgit v1.2.3 From 531b50e06aa7600f854a90b0f714f4e49ea2c1ac Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 26 Nov 2025 11:42:35 +0100 Subject: verification/rvgen: Adapt dot2k and templates after refactoring da_monitor.h Previous changes refactored the da_monitor header file to avoid using macros. This implies a few changes in how to import and use da_monitor helpers: DECLARE_DA_MON_(name, type) is substituted by #define RV_MON_TYPE RV_MON_ Update the rvgen templates to reflect the changes. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20251126104241.291258-5-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- tools/verification/rvgen/rvgen/dot2k.py | 6 ++++-- .../rvgen/rvgen/templates/dot2k/main.c | 25 ++++++++-------------- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py index ed0a3c901106..d618a842fc52 100644 --- a/tools/verification/rvgen/rvgen/dot2k.py +++ b/tools/verification/rvgen/rvgen/dot2k.py @@ -38,9 +38,9 @@ class dot2k(Monitor, Dot2c): handle = "handle_start_run_event" if self.monitor_type == "per_task": buff.append("\tstruct task_struct *p = /* XXX: how do I get p? */;"); - buff.append("\tda_%s_%s(p, %s%s);" % (handle, self.name, event, self.enum_suffix)); + buff.append("\tda_%s(p, %s%s);" % (handle, event, self.enum_suffix)); else: - buff.append("\tda_%s_%s(%s%s);" % (handle, self.name, event, self.enum_suffix)); + buff.append("\tda_%s(%s%s);" % (handle, event, self.enum_suffix)); buff.append("}") buff.append("") return '\n'.join(buff) @@ -66,6 +66,8 @@ class dot2k(Monitor, Dot2c): buff.append(" * Documentation/trace/rv/deterministic_automata.rst") buff.append(" */") buff.append("") + buff.append("#define MONITOR_NAME %s" % (self.name)) + buff.append("") return buff diff --git a/tools/verification/rvgen/rvgen/templates/dot2k/main.c b/tools/verification/rvgen/rvgen/templates/dot2k/main.c index e0fd1134bd85..a14e4f0883db 100644 --- a/tools/verification/rvgen/rvgen/templates/dot2k/main.c +++ b/tools/verification/rvgen/rvgen/templates/dot2k/main.c @@ -6,7 +6,6 @@ #include #include #include -#include #define MODULE_NAME "%%MODEL_NAME%%" @@ -20,15 +19,9 @@ * This is the self-generated part of the monitor. Generally, there is no need * to touch this section. */ +#define RV_MON_TYPE RV_MON_%%MONITOR_TYPE%% #include "%%MODEL_NAME%%.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ -static struct rv_monitor rv_%%MODEL_NAME%%; -DECLARE_DA_MON_%%MONITOR_TYPE%%(%%MODEL_NAME%%, %%MIN_TYPE%%); +#include /* * This is the instrumentation part of the monitor. @@ -42,7 +35,7 @@ static int enable_%%MODEL_NAME%%(void) { int retval; - retval = da_monitor_init_%%MODEL_NAME%%(); + retval = da_monitor_init(); if (retval) return retval; @@ -53,33 +46,33 @@ static int enable_%%MODEL_NAME%%(void) static void disable_%%MODEL_NAME%%(void) { - rv_%%MODEL_NAME%%.enabled = 0; + rv_this.enabled = 0; %%TRACEPOINT_DETACH%% - da_monitor_destroy_%%MODEL_NAME%%(); + da_monitor_destroy(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_%%MODEL_NAME%% = { +static struct rv_monitor rv_this = { .name = "%%MODEL_NAME%%", .description = "%%DESCRIPTION%%", .enable = enable_%%MODEL_NAME%%, .disable = disable_%%MODEL_NAME%%, - .reset = da_monitor_reset_all_%%MODEL_NAME%%, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_%%MODEL_NAME%%(void) { - return rv_register_monitor(&rv_%%MODEL_NAME%%, %%PARENT%%); + return rv_register_monitor(&rv_this, %%PARENT%%); } static void __exit unregister_%%MODEL_NAME%%(void) { - rv_unregister_monitor(&rv_%%MODEL_NAME%%); + rv_unregister_monitor(&rv_this); } module_init(register_%%MODEL_NAME%%); -- cgit v1.2.3 From 3c5720b9ba3ee9b3ae238aeaf0340e4c9666330e Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 26 Nov 2025 11:42:36 +0100 Subject: verification/rvgen: Annotate DA functions with types Functions in automata.py, dot2c.py and dot2k.py don't have type annotations and it can get complicated to remember how to use them. Add minimal type annotations. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20251126104241.291258-6-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- tools/verification/rvgen/rvgen/automata.py | 20 +++++++------- tools/verification/rvgen/rvgen/dot2c.py | 42 +++++++++++++++--------------- tools/verification/rvgen/rvgen/dot2k.py | 20 +++++++------- 3 files changed, 41 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verification/rvgen/rvgen/automata.py index d9a3fe2b74bf..3f06aef8d4fd 100644 --- a/tools/verification/rvgen/rvgen/automata.py +++ b/tools/verification/rvgen/rvgen/automata.py @@ -28,7 +28,7 @@ class Automata: self.function = self.__create_matrix() self.events_start, self.events_start_run = self.__store_init_events() - def __get_model_name(self): + def __get_model_name(self) -> str: basename = ntpath.basename(self.__dot_path) if not basename.endswith(".dot") and not basename.endswith(".gv"): print("not a dot file") @@ -40,7 +40,7 @@ class Automata: return model_name - def __open_dot(self): + def __open_dot(self) -> list[str]: cursor = 0 dot_lines = [] try: @@ -60,13 +60,13 @@ class Automata: cursor += 1 return dot_lines - def __get_cursor_begin_states(self): + def __get_cursor_begin_states(self) -> int: cursor = 0 while self.__dot_lines[cursor].split()[0] != "{node": cursor += 1 return cursor - def __get_cursor_begin_events(self): + def __get_cursor_begin_events(self) -> int: cursor = 0 while self.__dot_lines[cursor].split()[0] != "{node": cursor += 1 @@ -76,7 +76,7 @@ class Automata: cursor += 1 return cursor - def __get_state_variables(self): + def __get_state_variables(self) -> tuple[list[str], str, list[str]]: # wait for node declaration states = [] final_states = [] @@ -116,7 +116,7 @@ class Automata: return states, initial_state, final_states - def __get_event_variables(self): + def __get_event_variables(self) -> list[str]: # here we are at the begin of transitions, take a note, we will return later. cursor = self.__get_cursor_begin_events() @@ -140,7 +140,7 @@ class Automata: return sorted(set(events)) - def __create_matrix(self): + def __create_matrix(self) -> list[list[str]]: # transform the array into a dictionary events = self.events states = self.states @@ -174,7 +174,7 @@ class Automata: return matrix - def __store_init_events(self): + def __store_init_events(self) -> tuple[list[bool], list[bool]]: events_start = [False] * len(self.events) events_start_run = [False] * len(self.events) for i, _ in enumerate(self.events): @@ -196,10 +196,10 @@ class Automata: events_start_run[i] = True return events_start, events_start_run - def is_start_event(self, event): + def is_start_event(self, event: str) -> bool: return self.events_start[self.events.index(event)] - def is_start_run_event(self, event): + def is_start_run_event(self, event: str) -> bool: # prefer handle_start_event if there if any(self.events_start): return False diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py index b9b6f14cc536..fd64174fcfad 100644 --- a/tools/verification/rvgen/rvgen/dot2c.py +++ b/tools/verification/rvgen/rvgen/dot2c.py @@ -35,7 +35,7 @@ class Dot2c(Automata): # cut off the last \n return string[:-1] - def __get_enum_states_content(self): + def __get_enum_states_content(self) -> list[str]: buff = [] buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix)) for state in self.states: @@ -49,7 +49,7 @@ class Dot2c(Automata): buff = self.__get_enum_states_content() return self.__buff_to_string(buff) - def format_states_enum(self): + def format_states_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_states_def) buff.append(self.get_enum_states_string()) @@ -57,7 +57,7 @@ class Dot2c(Automata): return buff - def __get_enum_events_content(self): + def __get_enum_events_content(self) -> list[str]: buff = [] first = True for event in self.events: @@ -75,7 +75,7 @@ class Dot2c(Automata): buff = self.__get_enum_events_content() return self.__buff_to_string(buff) - def format_events_enum(self): + def format_events_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_events_def) buff.append(self.get_enum_events_string()) @@ -83,7 +83,7 @@ class Dot2c(Automata): return buff - def get_minimun_type(self): + def get_minimun_type(self) -> str: min_type = "unsigned char" if self.states.__len__() > 255: @@ -97,7 +97,7 @@ class Dot2c(Automata): return min_type - def format_automaton_definition(self): + def format_automaton_definition(self) -> list[str]: min_type = self.get_minimun_type() buff = [] buff.append("struct %s {" % self.struct_automaton_def) @@ -109,12 +109,12 @@ class Dot2c(Automata): buff.append("};\n") return buff - def format_aut_init_header(self): + def format_aut_init_header(self) -> list[str]: buff = [] buff.append("static const struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def)) return buff - def __get_string_vector_per_line_content(self, buff): + def __get_string_vector_per_line_content(self, buff: list[str]) -> str: first = True string = "" for entry in buff: @@ -133,14 +133,14 @@ class Dot2c(Automata): def get_aut_init_states_string(self): return self.__get_string_vector_per_line_content(self.states) - def format_aut_init_events_string(self): + def format_aut_init_events_string(self) -> list[str]: buff = [] buff.append("\t.event_names = {") buff.append(self.get_aut_init_events_string()) buff.append("\t},") return buff - def format_aut_init_states_string(self): + def format_aut_init_states_string(self) -> list[str]: buff = [] buff.append("\t.state_names = {") buff.append(self.get_aut_init_states_string()) @@ -148,11 +148,11 @@ class Dot2c(Automata): return buff - def __get_max_strlen_of_states(self): + def __get_max_strlen_of_states(self) -> int: max_state_name = max(self.states, key = len).__len__() return max(max_state_name, self.invalid_state_str.__len__()) - def get_aut_init_function(self): + def get_aut_init_function(self) -> str: nr_states = self.states.__len__() nr_events = self.events.__len__() buff = [] @@ -180,7 +180,7 @@ class Dot2c(Automata): return self.__buff_to_string(buff) - def format_aut_init_function(self): + def format_aut_init_function(self) -> list[str]: buff = [] buff.append("\t.function = {") buff.append(self.get_aut_init_function()) @@ -188,17 +188,17 @@ class Dot2c(Automata): return buff - def get_aut_init_initial_state(self): + def get_aut_init_initial_state(self) -> str: return self.initial_state - def format_aut_init_initial_state(self): + def format_aut_init_initial_state(self) -> list[str]: buff = [] initial_state = self.get_aut_init_initial_state() buff.append("\t.initial_state = " + initial_state + self.enum_suffix + ",") return buff - def get_aut_init_final_states(self): + def get_aut_init_final_states(self) -> str: line = "" first = True for state in self.states: @@ -213,29 +213,29 @@ class Dot2c(Automata): line = line + '0' return line - def format_aut_init_final_states(self): + def format_aut_init_final_states(self) -> list[str]: buff = [] buff.append("\t.final_states = { %s }," % self.get_aut_init_final_states()) return buff - def __get_automaton_initialization_footer_string(self): + def __get_automaton_initialization_footer_string(self) -> str: footer = "};\n" return footer - def format_aut_init_footer(self): + def format_aut_init_footer(self) -> list[str]: buff = [] buff.append(self.__get_automaton_initialization_footer_string()) return buff - def format_invalid_state(self): + def format_invalid_state(self) -> list[str]: buff = [] buff.append("#define %s state_max%s\n" % (self.invalid_state_str, self.enum_suffix)) return buff - def format_model(self): + def format_model(self) -> list[str]: buff = [] buff += self.format_states_enum() buff += self.format_invalid_state() diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py index d618a842fc52..6128fe238430 100644 --- a/tools/verification/rvgen/rvgen/dot2k.py +++ b/tools/verification/rvgen/rvgen/dot2k.py @@ -21,10 +21,10 @@ class dot2k(Monitor, Dot2c): Dot2c.__init__(self, file_path, extra_params.get("model_name")) self.enum_suffix = "_%s" % self.name - def fill_monitor_type(self): + def fill_monitor_type(self) -> str: return self.monitor_type.upper() - def fill_tracepoint_handlers_skel(self): + def fill_tracepoint_handlers_skel(self) -> str: buff = [] for event in self.events: buff.append("static void handle_%s(void *data, /* XXX: fill header */)" % event) @@ -45,19 +45,19 @@ class dot2k(Monitor, Dot2c): buff.append("") return '\n'.join(buff) - def fill_tracepoint_attach_probe(self): + def fill_tracepoint_attach_probe(self) -> str: buff = [] for event in self.events: buff.append("\trv_attach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event)) return '\n'.join(buff) - def fill_tracepoint_detach_helper(self): + def fill_tracepoint_detach_helper(self) -> str: buff = [] for event in self.events: buff.append("\trv_detach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event)) return '\n'.join(buff) - def fill_model_h_header(self): + def fill_model_h_header(self) -> list[str]: buff = [] buff.append("/* SPDX-License-Identifier: GPL-2.0 */") buff.append("/*") @@ -71,7 +71,7 @@ class dot2k(Monitor, Dot2c): return buff - def fill_model_h(self): + def fill_model_h(self) -> str: # # Adjust the definition names # @@ -85,17 +85,17 @@ class dot2k(Monitor, Dot2c): return '\n'.join(buff) - def fill_monitor_class_type(self): + def fill_monitor_class_type(self) -> str: if self.monitor_type == "per_task": return "DA_MON_EVENTS_ID" return "DA_MON_EVENTS_IMPLICIT" - def fill_monitor_class(self): + def fill_monitor_class(self) -> str: if self.monitor_type == "per_task": return "da_monitor_id" return "da_monitor" - def fill_tracepoint_args_skel(self, tp_type): + def fill_tracepoint_args_skel(self, tp_type: str) -> str: buff = [] tp_args_event = [ ("char *", "state"), @@ -117,7 +117,7 @@ class dot2k(Monitor, Dot2c): buff.append(" TP_ARGS(%s)" % tp_args_c) return '\n'.join(buff) - def fill_main_c(self): + def fill_main_c(self) -> str: main_c = super().fill_main_c() min_type = self.get_minimun_type() -- cgit v1.2.3 From 0d2405a086a035cce1e0ba1aa0849bd2104a4d6b Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 26 Nov 2025 11:42:37 +0100 Subject: verification/dot2c: Remove __buff_to_string() and cleanup str.join() can do what __buff_to_string() does. Therefore replace __buff_to_string() to make the scripts more pythonic. Also clean and remove some intermediate functions. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20251126104241.291258-7-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- tools/verification/rvgen/rvgen/dot2c.py | 35 ++++++--------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py index fd64174fcfad..24894411c3cd 100644 --- a/tools/verification/rvgen/rvgen/dot2c.py +++ b/tools/verification/rvgen/rvgen/dot2c.py @@ -26,15 +26,6 @@ class Dot2c(Automata): super().__init__(file_path, model_name) self.line_length = 100 - def __buff_to_string(self, buff): - string = "" - - for line in buff: - string = string + line + "\n" - - # cut off the last \n - return string[:-1] - def __get_enum_states_content(self) -> list[str]: buff = [] buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix)) @@ -45,14 +36,10 @@ class Dot2c(Automata): return buff - def get_enum_states_string(self): - buff = self.__get_enum_states_content() - return self.__buff_to_string(buff) - def format_states_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_states_def) - buff.append(self.get_enum_states_string()) + buff += self.__get_enum_states_content() buff.append("};\n") return buff @@ -71,14 +58,10 @@ class Dot2c(Automata): return buff - def get_enum_events_string(self): - buff = self.__get_enum_events_content() - return self.__buff_to_string(buff) - def format_events_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_events_def) - buff.append(self.get_enum_events_string()) + buff += self.__get_enum_events_content() buff.append("};\n") return buff @@ -127,23 +110,17 @@ class Dot2c(Automata): return string - def get_aut_init_events_string(self): - return self.__get_string_vector_per_line_content(self.events) - - def get_aut_init_states_string(self): - return self.__get_string_vector_per_line_content(self.states) - def format_aut_init_events_string(self) -> list[str]: buff = [] buff.append("\t.event_names = {") - buff.append(self.get_aut_init_events_string()) + buff.append(self.__get_string_vector_per_line_content(self.events)) buff.append("\t},") return buff def format_aut_init_states_string(self) -> list[str]: buff = [] buff.append("\t.state_names = {") - buff.append(self.get_aut_init_states_string()) + buff.append(self.__get_string_vector_per_line_content(self.states)) buff.append("\t},") return buff @@ -178,7 +155,7 @@ class Dot2c(Automata): line += "\n\t\t}," if linetoolong else " }," buff.append(line) - return self.__buff_to_string(buff) + return '\n'.join(buff) def format_aut_init_function(self) -> list[str]: buff = [] @@ -253,4 +230,4 @@ class Dot2c(Automata): def print_model_classic(self): buff = self.format_model() - print(self.__buff_to_string(buff)) + print('\n'.join(buff)) -- cgit v1.2.3 From 3d2bfeeef340c8494eba80e7a005159cac69c2f7 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 26 Nov 2025 11:42:38 +0100 Subject: verification/dot2c: Remove superfluous enum assignment and add last comma The header files generated by dot2c currently create enums for states and events assigning the first element to 0. This is superfluous as it happens automatically if no value is specified. Also it doesn't add a comma to the last enum elements, which slightly complicates the diff if states or events are added. Remove the assignment to 0 and add a comma to last elements, this simplifies the logic for the code generator. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20251126104241.291258-8-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- kernel/trace/rv/monitors/nrp/nrp.h | 20 +++++++++--------- kernel/trace/rv/monitors/opid/opid.h | 22 ++++++++++---------- kernel/trace/rv/monitors/sco/sco.h | 12 +++++------ kernel/trace/rv/monitors/scpd/scpd.h | 12 +++++------ kernel/trace/rv/monitors/snep/snep.h | 16 +++++++-------- kernel/trace/rv/monitors/snroc/snroc.h | 12 +++++------ kernel/trace/rv/monitors/sssw/sssw.h | 20 +++++++++--------- kernel/trace/rv/monitors/sts/sts.h | 26 ++++++++++++------------ kernel/trace/rv/monitors/wip/wip.h | 12 +++++------ kernel/trace/rv/monitors/wwnr/wwnr.h | 12 +++++------ tools/verification/rvgen/rvgen/dot2c.py | 36 +++++++++++---------------------- 11 files changed, 94 insertions(+), 106 deletions(-) (limited to 'tools') diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h index c2ec83da2124..3270d4c0139f 100644 --- a/kernel/trace/rv/monitors/nrp/nrp.h +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -8,21 +8,21 @@ #define MONITOR_NAME nrp enum states_nrp { - preempt_irq_nrp = 0, + preempt_irq_nrp, any_thread_running_nrp, nested_preempt_nrp, rescheduling_nrp, - state_max_nrp + state_max_nrp, }; #define INVALID_STATE state_max_nrp enum events_nrp { - irq_entry_nrp = 0, + irq_entry_nrp, sched_need_resched_nrp, schedule_entry_nrp, schedule_entry_preempt_nrp, - event_max_nrp + event_max_nrp, }; struct automaton_nrp { @@ -38,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = { "preempt_irq", "any_thread_running", "nested_preempt", - "rescheduling" + "rescheduling", }, .event_names = { "irq_entry", "sched_need_resched", "schedule_entry", - "schedule_entry_preempt" + "schedule_entry_preempt", }, .function = { { preempt_irq_nrp, preempt_irq_nrp, nested_preempt_nrp, - nested_preempt_nrp + nested_preempt_nrp, }, { any_thread_running_nrp, rescheduling_nrp, any_thread_running_nrp, - INVALID_STATE + INVALID_STATE, }, { nested_preempt_nrp, preempt_irq_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, { preempt_irq_nrp, rescheduling_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, }, .initial_state = preempt_irq_nrp, diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h index 5014f1b85ecf..092992514970 100644 --- a/kernel/trace/rv/monitors/opid/opid.h +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -8,25 +8,25 @@ #define MONITOR_NAME opid enum states_opid { - disabled_opid = 0, + disabled_opid, enabled_opid, in_irq_opid, irq_disabled_opid, preempt_disabled_opid, - state_max_opid + state_max_opid, }; #define INVALID_STATE state_max_opid enum events_opid { - irq_disable_opid = 0, + irq_disable_opid, irq_enable_opid, irq_entry_opid, preempt_disable_opid, preempt_enable_opid, sched_need_resched_opid, sched_waking_opid, - event_max_opid + event_max_opid, }; struct automaton_opid { @@ -43,7 +43,7 @@ static const struct automaton_opid automaton_opid = { "enabled", "in_irq", "irq_disabled", - "preempt_disabled" + "preempt_disabled", }, .event_names = { "irq_disable", @@ -52,7 +52,7 @@ static const struct automaton_opid automaton_opid = { "preempt_disable", "preempt_enable", "sched_need_resched", - "sched_waking" + "sched_waking", }, .function = { { @@ -62,7 +62,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, irq_disabled_opid, disabled_opid, - disabled_opid + disabled_opid, }, { irq_disabled_opid, @@ -71,7 +71,7 @@ static const struct automaton_opid automaton_opid = { preempt_disabled_opid, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -80,7 +80,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, INVALID_STATE, in_irq_opid, - in_irq_opid + in_irq_opid, }, { INVALID_STATE, @@ -89,7 +89,7 @@ static const struct automaton_opid automaton_opid = { disabled_opid, INVALID_STATE, irq_disabled_opid, - INVALID_STATE + INVALID_STATE, }, { disabled_opid, @@ -98,7 +98,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = disabled_opid, diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h index 06b1c420ce54..bac3beb51e72 100644 --- a/kernel/trace/rv/monitors/sco/sco.h +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -8,18 +8,18 @@ #define MONITOR_NAME sco enum states_sco { - thread_context_sco = 0, + thread_context_sco, scheduling_context_sco, - state_max_sco + state_max_sco, }; #define INVALID_STATE state_max_sco enum events_sco { - sched_set_state_sco = 0, + sched_set_state_sco, schedule_entry_sco, schedule_exit_sco, - event_max_sco + event_max_sco, }; struct automaton_sco { @@ -33,12 +33,12 @@ struct automaton_sco { static const struct automaton_sco automaton_sco = { .state_names = { "thread_context", - "scheduling_context" + "scheduling_context", }, .event_names = { "sched_set_state", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { thread_context_sco, scheduling_context_sco, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h index 4a725a68085a..d6329da2671b 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.h +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -8,19 +8,19 @@ #define MONITOR_NAME scpd enum states_scpd { - cant_sched_scpd = 0, + cant_sched_scpd, can_sched_scpd, - state_max_scpd + state_max_scpd, }; #define INVALID_STATE state_max_scpd enum events_scpd { - preempt_disable_scpd = 0, + preempt_disable_scpd, preempt_enable_scpd, schedule_entry_scpd, schedule_exit_scpd, - event_max_scpd + event_max_scpd, }; struct automaton_scpd { @@ -34,13 +34,13 @@ struct automaton_scpd { static const struct automaton_scpd automaton_scpd = { .state_names = { "cant_sched", - "can_sched" + "can_sched", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 753080dc5fa1..357520a5b3d1 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -8,19 +8,19 @@ #define MONITOR_NAME snep enum states_snep { - non_scheduling_context_snep = 0, + non_scheduling_context_snep, scheduling_contex_snep, - state_max_snep + state_max_snep, }; #define INVALID_STATE state_max_snep enum events_snep { - preempt_disable_snep = 0, + preempt_disable_snep, preempt_enable_snep, schedule_entry_snep, schedule_exit_snep, - event_max_snep + event_max_snep, }; struct automaton_snep { @@ -34,26 +34,26 @@ struct automaton_snep { static const struct automaton_snep automaton_snep = { .state_names = { "non_scheduling_context", - "scheduling_contex" + "scheduling_contex", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, INVALID_STATE, INVALID_STATE, - non_scheduling_context_snep + non_scheduling_context_snep, }, }, .initial_state = non_scheduling_context_snep, diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h index ada5ee08bdab..88b7328ad31a 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.h +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -8,18 +8,18 @@ #define MONITOR_NAME snroc enum states_snroc { - other_context_snroc = 0, + other_context_snroc, own_context_snroc, - state_max_snroc + state_max_snroc, }; #define INVALID_STATE state_max_snroc enum events_snroc { - sched_set_state_snroc = 0, + sched_set_state_snroc, sched_switch_in_snroc, sched_switch_out_snroc, - event_max_snroc + event_max_snroc, }; struct automaton_snroc { @@ -33,12 +33,12 @@ struct automaton_snroc { static const struct automaton_snroc automaton_snroc = { .state_names = { "other_context", - "own_context" + "own_context", }, .event_names = { "sched_set_state", "sched_switch_in", - "sched_switch_out" + "sched_switch_out", }, .function = { { INVALID_STATE, own_context_snroc, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h index 8409eaadc7e0..1a4b806061c3 100644 --- a/kernel/trace/rv/monitors/sssw/sssw.h +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -8,17 +8,17 @@ #define MONITOR_NAME sssw enum states_sssw { - runnable_sssw = 0, + runnable_sssw, signal_wakeup_sssw, sleepable_sssw, sleeping_sssw, - state_max_sssw + state_max_sssw, }; #define INVALID_STATE state_max_sssw enum events_sssw { - sched_set_state_runnable_sssw = 0, + sched_set_state_runnable_sssw, sched_set_state_sleepable_sssw, sched_switch_blocking_sssw, sched_switch_in_sssw, @@ -27,7 +27,7 @@ enum events_sssw { sched_switch_yield_sssw, sched_wakeup_sssw, signal_deliver_sssw, - event_max_sssw + event_max_sssw, }; struct automaton_sssw { @@ -43,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = { "runnable", "signal_wakeup", "sleepable", - "sleeping" + "sleeping", }, .event_names = { "sched_set_state_runnable", @@ -54,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = { "sched_switch_suspend", "sched_switch_yield", "sched_wakeup", - "signal_deliver" + "signal_deliver", }, .function = { { @@ -66,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, runnable_sssw, runnable_sssw, - runnable_sssw + runnable_sssw, }, { INVALID_STATE, @@ -77,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, signal_wakeup_sssw, signal_wakeup_sssw, - runnable_sssw + runnable_sssw, }, { runnable_sssw, @@ -88,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = { sleeping_sssw, signal_wakeup_sssw, runnable_sssw, - sleepable_sssw + sleepable_sssw, }, { INVALID_STATE, @@ -99,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, INVALID_STATE, runnable_sssw, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = runnable_sssw, diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h index 3779d7f99404..6f7b2d9d72e6 100644 --- a/kernel/trace/rv/monitors/sts/sts.h +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -8,26 +8,26 @@ #define MONITOR_NAME sts enum states_sts { - can_sched_sts = 0, + can_sched_sts, cant_sched_sts, disable_to_switch_sts, enable_to_exit_sts, in_irq_sts, scheduling_sts, switching_sts, - state_max_sts + state_max_sts, }; #define INVALID_STATE state_max_sts enum events_sts { - irq_disable_sts = 0, + irq_disable_sts, irq_enable_sts, irq_entry_sts, sched_switch_sts, schedule_entry_sts, schedule_exit_sts, - event_max_sts + event_max_sts, }; struct automaton_sts { @@ -46,7 +46,7 @@ static const struct automaton_sts automaton_sts = { "enable_to_exit", "in_irq", "scheduling", - "switching" + "switching", }, .event_names = { "irq_disable", @@ -54,7 +54,7 @@ static const struct automaton_sts automaton_sts = { "irq_entry", "sched_switch", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { @@ -63,7 +63,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, scheduling_sts, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -71,7 +71,7 @@ static const struct automaton_sts automaton_sts = { cant_sched_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -79,7 +79,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, switching_sts, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { enable_to_exit_sts, @@ -87,7 +87,7 @@ static const struct automaton_sts automaton_sts = { enable_to_exit_sts, INVALID_STATE, INVALID_STATE, - can_sched_sts + can_sched_sts, }, { INVALID_STATE, @@ -95,7 +95,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { disable_to_switch_sts, @@ -103,7 +103,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -111,7 +111,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = can_sched_sts, diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index cfdc52975354..b4c3eea94c86 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -8,18 +8,18 @@ #define MONITOR_NAME wip enum states_wip { - preemptive_wip = 0, + preemptive_wip, non_preemptive_wip, - state_max_wip + state_max_wip, }; #define INVALID_STATE state_max_wip enum events_wip { - preempt_disable_wip = 0, + preempt_disable_wip, preempt_enable_wip, sched_waking_wip, - event_max_wip + event_max_wip, }; struct automaton_wip { @@ -33,12 +33,12 @@ struct automaton_wip { static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", - "non_preemptive" + "non_preemptive", }, .event_names = { "preempt_disable", "preempt_enable", - "sched_waking" + "sched_waking", }, .function = { { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index 85d12e42a955..a28006512c9b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -8,18 +8,18 @@ #define MONITOR_NAME wwnr enum states_wwnr { - not_running_wwnr = 0, + not_running_wwnr, running_wwnr, - state_max_wwnr + state_max_wwnr, }; #define INVALID_STATE state_max_wwnr enum events_wwnr { - switch_in_wwnr = 0, + switch_in_wwnr, switch_out_wwnr, wakeup_wwnr, - event_max_wwnr + event_max_wwnr, }; struct automaton_wwnr { @@ -33,12 +33,12 @@ struct automaton_wwnr { static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", - "running" + "running", }, .event_names = { "switch_in", "switch_out", - "wakeup" + "wakeup", }, .function = { { running_wwnr, INVALID_STATE, not_running_wwnr }, diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py index 24894411c3cd..06a26bf15a7e 100644 --- a/tools/verification/rvgen/rvgen/dot2c.py +++ b/tools/verification/rvgen/rvgen/dot2c.py @@ -28,11 +28,11 @@ class Dot2c(Automata): def __get_enum_states_content(self) -> list[str]: buff = [] - buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix)) + buff.append("\t%s%s," % (self.initial_state, self.enum_suffix)) for state in self.states: if state != self.initial_state: buff.append("\t%s%s," % (state, self.enum_suffix)) - buff.append("\tstate_max%s" % (self.enum_suffix)) + buff.append("\tstate_max%s," % (self.enum_suffix)) return buff @@ -46,15 +46,10 @@ class Dot2c(Automata): def __get_enum_events_content(self) -> list[str]: buff = [] - first = True for event in self.events: - if first: - buff.append("\t%s%s = 0," % (event, self.enum_suffix)) - first = False - else: - buff.append("\t%s%s," % (event, self.enum_suffix)) + buff.append("\t%s%s," % (event, self.enum_suffix)) - buff.append("\tevent_max%s" % self.enum_suffix) + buff.append("\tevent_max%s," % self.enum_suffix) return buff @@ -97,18 +92,11 @@ class Dot2c(Automata): buff.append("static const struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def)) return buff - def __get_string_vector_per_line_content(self, buff: list[str]) -> str: - first = True - string = "" - for entry in buff: - if first: - string = string + "\t\t\"" + entry - first = False; - else: - string = string + "\",\n\t\t\"" + entry - string = string + "\"" - - return string + def __get_string_vector_per_line_content(self, entries: list[str]) -> str: + buff = [] + for entry in entries: + buff.append(f"\t\t\"{entry}\",") + return "\n".join(buff) def format_aut_init_events_string(self) -> list[str]: buff = [] @@ -152,7 +140,7 @@ class Dot2c(Automata): if y != nr_events-1: line += ",\n" if linetoolong else ", " else: - line += "\n\t\t}," if linetoolong else " }," + line += ",\n\t\t}," if linetoolong else " }," buff.append(line) return '\n'.join(buff) @@ -179,12 +167,12 @@ class Dot2c(Automata): line = "" first = True for state in self.states: - if first == False: + if not first: line = line + ', ' else: first = False - if self.final_states.__contains__(state): + if state in self.final_states: line = line + '1' else: line = line + '0' -- cgit v1.2.3 From 3fee5b320c15c8f61e44729a9513347de6a93735 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 26 Nov 2025 11:42:39 +0100 Subject: verification/rvgen: Remove unused variable declaration from containers The monitor container source files contained a declaration and a definition for the rv_monitor variable. The former is superfluous and can be removed. Remove the variable declaration from the template as well as the existing monitor containers. Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20251126104241.291258-9-gmonaco@redhat.com Signed-off-by: Gabriele Monaco --- kernel/trace/rv/monitors/rtapp/rtapp.c | 2 -- kernel/trace/rv/monitors/sched/sched.c | 2 -- tools/verification/rvgen/rvgen/templates/container/main.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'tools') diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c index fd75fc927d65..17f271231c99 100644 --- a/kernel/trace/rv/monitors/rtapp/rtapp.c +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -8,8 +8,6 @@ #include "rtapp.h" -struct rv_monitor rv_rtapp; - struct rv_monitor rv_rtapp = { .name = "rtapp", .description = "Collection of monitors for detecting problems with real-time applications", diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index d04db4b543f9..dd9d96fc6e21 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -8,8 +8,6 @@ #include "sched.h" -struct rv_monitor rv_sched; - struct rv_monitor rv_sched = { .name = "sched", .description = "container for several scheduler monitor specifications.", diff --git a/tools/verification/rvgen/rvgen/templates/container/main.c b/tools/verification/rvgen/rvgen/templates/container/main.c index 7d9b2f95c7e9..5fc89b46f279 100644 --- a/tools/verification/rvgen/rvgen/templates/container/main.c +++ b/tools/verification/rvgen/rvgen/templates/container/main.c @@ -8,8 +8,6 @@ #include "%%MODEL_NAME%%.h" -struct rv_monitor rv_%%MODEL_NAME%%; - struct rv_monitor rv_%%MODEL_NAME%% = { .name = "%%MODEL_NAME%%", .description = "%%DESCRIPTION%%", -- cgit v1.2.3 From bbf8c67aa6ae8bd588f097510d887dad071f9f43 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 8 Jan 2026 19:38:36 +0800 Subject: tools: jobserver: Prevent deadlock caused by incorrect jobserver configuration and enhance error reporting When using GNU Make's jobserver feature in kernel builds, a bug in MAKEFLAGS propagation caused "--jobserver-auth=r,w" to reference an unintended file descriptor. This led to infinite loops in jobserver-exec's os.read() calls due to empty token. My shell opened /etc/passwd for some reason without closing it, and as a result, all child processes inherited this fd 3. $ ls -l /proc/self/fd total 0 lrwx------ 1 changbin changbin 64 Dec 25 13:03 0 -> /dev/pts/1 lrwx------ 1 changbin changbin 64 Dec 25 13:03 1 -> /dev/pts/1 lrwx------ 1 changbin changbin 64 Dec 25 13:03 2 -> /dev/pts/1 lr-x------ 1 changbin changbin 64 Dec 25 13:03 3 -> /etc/passwd lr-x------ 1 changbin changbin 64 Dec 25 13:03 4 -> /proc/1421383/fd In this case, the `make` should open a new file descriptor for jobserver control, but clearly, it did not do so and instead still passed fd 3 as "--jobserver-auth=3,4" in MAKEFLAGS. (The version of my gnu make is 4.3) This update ensures robustness against invalid jobserver configurations, even when `make` incorrectly pass non-pipe file descriptors. * Rejecting empty reads to prevent infinite loops on EOF. * Clearing `self.jobs` to avoid writing to incorrect files if invalid tokens are detected. * Printing detailed error messages to stderr to inform the user. Cc: Mauro Carvalho Chehab Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Changbin Du Signed-off-by: Jonathan Corbet Message-ID: <20260108113836.2976527-1-changbin.du@huawei.com> --- tools/lib/python/jobserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index a24f30ef4fa8..616411087725 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -91,6 +91,10 @@ class JobserverExec: while True: try: slot = os.read(self.reader, 8) + if not slot: + # Clear self.jobs to prevent us from probably writing incorrect file. + self.jobs = b"" + raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?") self.jobs += slot except (OSError, IOError) as e: if e.errno == errno.EWOULDBLOCK: @@ -105,7 +109,8 @@ class JobserverExec: # to sit here blocked on our child. self.claim = len(self.jobs) + 1 - except (KeyError, IndexError, ValueError, OSError, IOError): + except (KeyError, IndexError, ValueError, OSError, IOError) as e: + print(f"jobserver: warning: {repr(e)}", file=sys.stderr) # Any missing environment strings or bad fds should result in just # not being parallel. self.claim = None -- cgit v1.2.3 From c1d7c0f9cdf6690eff4518f1c17a37d5ee647cd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:40 -0700 Subject: selftests: ublk: display UBLK_F_INTEGRITY support Add support for printing the UBLK_F_INTEGRITY feature flag in the human-readable kublk features output. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..261095f19c93 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1454,6 +1454,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_INTEGRITY), }; struct ublk_dev *dev; __u64 features = 0; -- cgit v1.2.3 From 261b67f4e34716e793b0b95d2722b2fe780ed5f4 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:41 -0700 Subject: selftests: ublk: add utility to get block device metadata size Some block device integrity parameters are available in sysfs, but others are only accessible using the FS_IOC_GETLBMD_CAP ioctl. Add a metadata_size utility program to print out the logical block metadata size, PI offset, and PI size within the metadata. Example output: $ metadata_size /dev/ublkb0 metadata_size: 64 pi_offset: 56 pi_tuple_size: 8 Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 5 ++-- tools/testing/selftests/ublk/metadata_size.c | 36 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ublk/metadata_size.c (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 06ba6fde098d..351ac6438561 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -49,12 +49,13 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh -TEST_GEN_PROGS_EXTENDED = kublk +TEST_GEN_PROGS_EXTENDED = kublk metadata_size +STANDALONE_UTILS := metadata_size.c LOCAL_HDRS += $(wildcard *.h) include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) +$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c new file mode 100644 index 000000000000..76ecddf04d25 --- /dev/null +++ b/tools/testing/selftests/ublk/metadata_size.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + struct logical_block_metadata_cap cap = {}; + const char *filename; + int fd; + int result; + + if (argc != 2) { + fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]); + return 1; + } + + filename = argv[1]; + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + return 1; + } + + result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap); + if (result < 0) { + perror("ioctl"); + return 1; + } + + printf("metadata_size: %u\n", cap.lbmd_size); + printf("pi_offset: %u\n", cap.lbmd_pi_offset); + printf("pi_tuple_size: %u\n", cap.lbmd_pi_size); + return 0; +} -- cgit v1.2.3 From 6ed6476c4aefa9ee3ba90f39bcc002dd034f6e03 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:42 -0700 Subject: selftests: ublk: add kublk support for integrity params Add integrity param command line arguments to kublk. Plumb these to struct ublk_params for the null and fault_inject targets, as they don't need to actually read or write the integrity data. Forbid the integrity params for loop or stripe until the integrity data copy is implemented. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/fault_inject.c | 1 + tools/testing/selftests/ublk/file_backed.c | 4 +++ tools/testing/selftests/ublk/kublk.c | 47 +++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 21 +++++++++++++ tools/testing/selftests/ublk/null.c | 1 + tools/testing/selftests/ublk/stripe.c | 4 +++ 6 files changed, 78 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index b227bd78b252..3b897f69c014 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, .dev_sectors = dev_size >> 9, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); return 0; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 269d5f124e06..c14ce6608696 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -158,6 +158,10 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } ret = backing_file_tgt_init(dev); if (ret) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 261095f19c93..48e1865b4875 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -3,6 +3,7 @@ * Description: uring_cmd based ublk */ +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -1550,6 +1551,8 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " + "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1613,6 +1616,12 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "integrity_capable", 0, NULL, 0 }, + { "integrity_reftag", 0, NULL, 0 }, + { "metadata_size", 1, NULL, 0 }, + { "pi_offset", 1, NULL, 0 }, + { "csum_type", 1, NULL, 0 }, + { "tag_size", 1, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1623,6 +1632,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .csum_type = LBMD_PI_CSUM_NONE, }; int ret = -EINVAL, i; int tgt_argc = 1; @@ -1697,6 +1707,28 @@ int main(int argc, char *argv[]) ctx.per_io_tasks = 1; if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) ctx.no_ublk_fixed_fd = 1; + if (!strcmp(longopts[option_idx].name, "integrity_capable")) + ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; + if (!strcmp(longopts[option_idx].name, "integrity_reftag")) + ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; + if (!strcmp(longopts[option_idx].name, "metadata_size")) + ctx.metadata_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "pi_offset")) + ctx.pi_offset = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "csum_type")) { + if (!strcmp(optarg, "ip")) { + ctx.csum_type = LBMD_PI_CSUM_IP; + } else if (!strcmp(optarg, "t10dif")) { + ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; + } else if (!strcmp(optarg, "nvme")) { + ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; + } else { + ublk_err("invalid csum_type: %s\n", optarg); + return -EINVAL; + } + } + if (!strcmp(longopts[option_idx].name, "tag_size")) + ctx.tag_size = strtoul(optarg, NULL, 0); break; case '?': /* @@ -1739,6 +1771,21 @@ int main(int argc, char *argv[]) return -EINVAL; } + if (ctx.metadata_size) { + if (!(ctx.flags & UBLK_F_USER_COPY)) { + ublk_err("integrity requires user_copy\n"); + return -EINVAL; + } + + ctx.flags |= UBLK_F_INTEGRITY; + } else if (ctx.integrity_flags || + ctx.pi_offset || + ctx.csum_type != LBMD_PI_CSUM_NONE || + ctx.tag_size) { + ublk_err("integrity parameters require metadata_size\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8a83b90ec603..d00f2b465cdf 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,6 +78,11 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + __u32 integrity_flags; + __u8 metadata_size; + __u8 pi_offset; + __u8 csum_type; + __u8 tag_size; int _evtfd; int _shmid; @@ -202,6 +207,22 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, + struct ublk_params *params) +{ + if (!ctx->metadata_size) + return; + + params->types |= UBLK_PARAM_TYPE_INTEGRITY; + params->integrity = (struct ublk_param_integrity) { + .flags = ctx->integrity_flags, + .interval_exp = params->basic.logical_bs_shift, + .metadata_size = ctx->metadata_size, + .pi_offset = ctx->pi_offset, + .csum_type = ctx->csum_type, + .tag_size = ctx->tag_size, + }; +} static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b689..3aa162f08476 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_segments = 32, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index fd412e1f01c0..d4aaf3351d71 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -298,6 +298,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); -- cgit v1.2.3 From 24f8a44b797f03dfadb455138930523599d3c22a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:43 -0700 Subject: selftests: ublk: implement integrity user copy in kublk If integrity data is enabled for kublk, allocate an integrity buffer for each I/O. Extend ublk_user_copy() to copy the integrity data between the ublk request and the integrity buffer if the ublksrv_io_desc indicates that the request has integrity data. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 41 +++++++++++++++++++++++++++++++----- tools/testing/selftests/ublk/kublk.h | 14 ++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 48e1865b4875..d95937dd6167 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -416,8 +416,10 @@ static void ublk_queue_deinit(struct ublk_queue *q) if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); - for (i = 0; i < nr_ios; i++) + for (i = 0; i < nr_ios; i++) { free(q->ios[i].buf_addr); + free(q->ios[i].integrity_buf); + } } static void ublk_thread_deinit(struct ublk_thread *t) @@ -433,12 +435,13 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, + __u8 metadata_size) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; int i; - int cmd_buf_size, io_buf_size; + int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; q->tgt_ops = dev->tgt.ops; @@ -446,6 +449,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) q->q_depth = depth; q->flags = dev->dev_info.flags; q->flags |= extra_flags; + q->metadata_size = metadata_size; /* Cache fd in queue for fast path access */ q->ublk_fd = dev->fds[0]; @@ -461,11 +465,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) } io_buf_size = dev->dev_info.max_io_buf_bytes; + integrity_size = ublk_integrity_len(q, io_buf_size); for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; + if (integrity_size) { + q->ios[i].integrity_buf = malloc(integrity_size); + if (!q->ios[i].integrity_buf) { + ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", + dev->dev_info.dev_id, q->q_id, i, + integrity_size); + goto fail; + } + } + + if (ublk_queue_no_buf(q)) continue; @@ -608,13 +624,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) __u8 ublk_op = ublksrv_get_op(iod); __u32 len = iod->nr_sectors << 9; void *addr = io->buf_addr; + ssize_t copied; if (ublk_op != match_ublk_op) return; while (len) { __u32 copy_len = min(len, UBLK_USER_COPY_LEN); - ssize_t copied; if (ublk_op == UBLK_IO_OP_WRITE) copied = pread(q->ublk_fd, addr, copy_len, off); @@ -627,6 +643,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) off += copy_len; len -= copy_len; } + + if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) + return; + + len = ublk_integrity_len(q, iod->nr_sectors << 9); + off = ublk_user_copy_offset(q->q_id, io->tag); + off |= UBLKSRV_IO_INTEGRITY_FLAG; + if (ublk_op == UBLK_IO_OP_WRITE) + copied = pread(q->ublk_fd, io->integrity_buf, len, off); + else if (ublk_op == UBLK_IO_OP_READ) + copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); + else + assert(0); + assert(copied == (ssize_t)len); } int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) @@ -1013,7 +1043,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->q[i].dev = dev; dev->q[i].q_id = i; - ret = ublk_queue_init(&dev->q[i], extra_flags); + ret = ublk_queue_init(&dev->q[i], extra_flags, + ctx->metadata_size); if (ret) { ublk_err("ublk dev %d queue %d init queue failed\n", dinfo->dev_id, i); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index d00f2b465cdf..830b49a7716a 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -112,6 +112,7 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; + void *integrity_buf; #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) #define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) @@ -175,6 +176,7 @@ struct ublk_queue { #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) __u64 flags; int ublk_fd; /* cached ublk char device fd */ + __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; @@ -224,6 +226,18 @@ static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, }; } +static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) +{ + /* All targets currently use interval_exp = logical_bs_shift = 9 */ + return (len >> 9) * q->metadata_size; +} + +static inline size_t +ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) +{ + return (integrity_len / q->metadata_size) << 9; +} + static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF); -- cgit v1.2.3 From a1805442674b85ff9d626965f828e4fd71a82b28 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:44 -0700 Subject: selftests: ublk: support non-O_DIRECT backing files A subsequent commit will add support for using a backing file to store integrity data. Since integrity data is accessed in intervals of metadata_size, which may be much smaller than a logical block on the backing device, direct I/O cannot be used. Add an argument to backing_file_tgt_init() to specify the number of files to open for direct I/O. The remaining files will use buffered I/O. For now, continue to request direct I/O for all the files. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 4 ++-- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f8519..d9873d4d50d0 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -12,7 +12,7 @@ void backing_file_tgt_deinit(struct ublk_dev *dev) } } -int backing_file_tgt_init(struct ublk_dev *dev) +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; @@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - fd = open(file, O_RDWR | O_DIRECT); + fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0)); if (fd < 0) { ublk_err("%s: backing file %s can't be opened: %s\n", __func__, file, strerror(errno)); diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c14ce6608696..db4c176a4f28 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -163,7 +163,7 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) return -EINVAL; } - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, 1); if (ret) return ret; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 830b49a7716a..96c66b337bc0 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -462,6 +462,6 @@ extern const struct ublk_tgt_ops stripe_tgt_ops; extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); -int backing_file_tgt_init(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); #endif diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index d4aaf3351d71..2be1c36438e7 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -315,7 +315,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) chunk_shift = ilog2(chunk_size); - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files); if (ret) return ret; -- cgit v1.2.3 From f48250dc5ba8368ccb587093eb20d1c7baecaacf Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:45 -0700 Subject: selftests: ublk: add integrity data support to loop target To perform and end-to-end test of integrity information through a ublk device, we need to actually store it somewhere and retrieve it. Add this support to kublk's loop target. It uses a second backing file for the integrity data corresponding to the data stored in the first file. The integrity file is initialized with byte 0xFF, which ensures the app and reference tags are set to the "escape" pattern to disable the bio-integrity-auto guard and reftag checks until the blocks are written. The integrity file is opened without O_DIRECT since it will be accessed at sub-block granularity. Each incoming read/write results in a pair of reads/writes, one to the data file, and one to the integrity file. If either backing I/O fails, the error is propagated to the ublk request. If both backing I/Os read/write some bytes, the ublk request is completed with the smaller of the number of blocks accessed by each I/O. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 92 ++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index db4c176a4f28..c3ce5ff72422 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -35,9 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, unsigned auto_zc = ublk_queue_use_auto_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct ublk_io *io = ublk_get_io(q, tag); + __u64 offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { + ublk_io_alloc_sqes(t, sqe, 1); + /* Use second backing file for integrity data */ + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2), + io->integrity_buf, + ublk_integrity_len(q, len), + ublk_integrity_len(q, offset)); + sqe[0]->flags = IOSQE_FIXED_FILE; + /* tgt_data = 1 indicates integrity I/O */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1); + } + if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) @@ -45,14 +59,14 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); if (auto_zc) sqe[0]->buf_index = tag; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - return 1; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1; } ublk_io_alloc_sqes(t, sqe, 3); @@ -63,8 +77,8 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); sqe[1]->buf_index = tag; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -72,7 +86,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); - return 2; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; } static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) @@ -119,12 +133,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); - if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { - if (!io->result) - io->result = cqe->res; - if (cqe->res < 0) - ublk_err("%s: io failed op %x user_data %lx\n", - __func__, op, cqe->user_data); + if (cqe->res < 0) { + io->result = cqe->res; + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + __s32 data_len = user_data_to_tgt_data(cqe->user_data) + ? ublk_integrity_data_len(q, cqe->res) + : cqe->res; + + if (!io->result || data_len < io->result) + io->result = data_len; } /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ @@ -135,9 +154,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, ublk_complete_io(t, q, tag, io->result); } +static int ublk_loop_memset_file(int fd, __u8 byte, size_t len) +{ + off_t offset = 0; + __u8 buf[4096]; + + memset(buf, byte, sizeof(buf)); + while (len) { + int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset); + + if (ret < 0) + return -errno; + if (!ret) + return -EIO; + + len -= ret; + offset += ret; + } + return 0; +} + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; + unsigned long blocks; int ret; struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, @@ -154,23 +194,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + ublk_set_integrity_params(ctx, &p); if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } - if (ctx->metadata_size) { - ublk_err("%s: integrity not supported\n", __func__); - return -EINVAL; - } + /* Use O_DIRECT only for data file */ ret = backing_file_tgt_init(dev, 1); if (ret) return ret; - if (dev->tgt.nr_backing_files != 1) + /* Expect a second file for integrity data */ + if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size) return -EINVAL; - bytes = dev->tgt.backing_file_size[0]; + blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift; + if (ctx->metadata_size) { + unsigned long metadata_blocks = + dev->tgt.backing_file_size[1] / ctx->metadata_size; + unsigned long integrity_len; + + /* Ensure both data and integrity data fit in backing files */ + blocks = min(blocks, metadata_blocks); + integrity_len = blocks * ctx->metadata_size; + /* + * Initialize PI app tag and ref tag to 0xFF + * to disable bio-integrity-auto checks + */ + ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len); + if (ret) + return ret; + } + bytes = blocks << p.basic.logical_bs_shift; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; -- cgit v1.2.3 From 9e9f635525b12f055558a7cfe2e54d109839d030 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:46 -0700 Subject: selftests: ublk: add integrity params test Add test case null_04 to exercise all the different integrity params. It creates 4 different ublk devices with different combinations of integrity arguments and verifies their integrity limits via sysfs and the metadata_size utility. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++ tools/testing/selftests/ublk/test_null_04.sh | 166 +++++++++++++++++++++++++++ 3 files changed, 177 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_null_04.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 351ac6438561..239ad1c741ef 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -27,6 +27,7 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh +TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ea9a5f3eb70a..7ff6ce79d62c 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -384,6 +384,16 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size" + +_get_metadata_size() +{ + local dev_id=$1 + local field=$2 + + "$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o "[0-9]*" +} + UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh new file mode 100755 index 000000000000..0b0719ea33a3 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID=null_04 + +_prep_test "null" "integrity params" + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 0 ]; then + echo "pi_tuple_size $pi_tuple_size != 0" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != nop ]; then + echo "format $format != nop" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 64 ]; then + echo "metadata_size $metadata_size != 64" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 56 ]; then + echo "pi_offset $pi_offset != 56" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 1 ]; then + echo "device_is_integrity_capable $capable != 1" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE3-IP ]; then + echo "format $format != T10-DIF-TYPE3-IP" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE1-CRC ]; then + echo "format $format != T10-DIF-TYPE1-CRC" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 16 ]; then + echo "metadata_size $metadata_size != 16" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 16 ]; then + echo "pi_tuple_size $pi_tuple_size != 16" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then + echo "format $format != EXT-DIF-TYPE3-CRC64" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 8 ]; then + echo "tag_size $tag_size != 8" + _show_result $TID 255 +fi +_cleanup_test + +_show_result $TID 0 -- cgit v1.2.3 From 78796b6bae8684b753b658f431b5b1ee24300d64 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:47 -0700 Subject: selftests: ublk: add end-to-end integrity test Add test case loop_08 to verify the ublk integrity data flow. It uses the kublk loop target to create a ublk device with integrity on top of backing data and integrity files. It then writes to the whole device with fio configured to generate integrity data. Then it reads back the whole device with fio configured to verify the integrity data. It also verifies that injected guard, reftag, and apptag corruptions are correctly detected. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_loop_08.sh | 111 +++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_loop_08.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 239ad1c741ef..036a9f01b464 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -35,6 +35,7 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh +TEST_PROGS += test_loop_08.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh new file mode 100755 index 000000000000..ca289cfb2ad4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +TID=loop_08 + +_prep_test "loop" "end-to-end integrity" + +_create_backfile 0 256M +_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) +integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" +dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") +_check_add_dev $TID $? + +# 1M * (64 integrity bytes / 512 data bytes) = 128K +fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" +fio --name fill --rw randwrite $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio fill failed" + _show_result $TID $err +fi + +fio --name verify --rw randread $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio verify failed" + _show_result $TID $err +fi + +fio_err=$(mktemp fio_err_XXXXX) + +# Overwrite 4-byte reftag at offset 56 + 4 = 60 +dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" +dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi +# Reset to 0 +dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd restore corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi + +dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" +dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_data failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +rm -f "$fio_err" + +_cleanup_test +_show_result $TID 0 -- cgit v1.2.3 From bc5e8e2fa2e28ef6c2a55ae294d04100d4b1bffe Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 Jan 2026 12:05:14 +0100 Subject: x86/xen: Drop xen_irq_ops Instead of having a pre-filled array xen_irq_ops for Xen PV paravirt functions, drop the array and assign each element individually. This is in preparation of reducing the paravirt include hell by splitting paravirt.h into multiple more fine grained header files, which will in turn require to split up the pv_ops vector as well. Dropping the pre-filled array makes life easier for objtool to detect missing initializers in multiple pv_ops_ arrays. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Boris Ostrovsky Link: https://patch.msgid.link/20260105110520.21356-16-jgross@suse.com --- arch/x86/xen/irq.c | 20 +++++++------------- tools/objtool/check.c | 1 - 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 39982f955cfe..d8678c3d3971 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -40,20 +40,14 @@ static void xen_halt(void) xen_safe_halt(); } -static const typeof(pv_ops) xen_irq_ops __initconst = { - .irq = { - /* Initial interrupt flag handling only called while interrupts off. */ - .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), - .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), - .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func), - - .safe_halt = xen_safe_halt, - .halt = xen_halt, - }, -}; - void __init xen_init_irq_ops(void) { - pv_ops.irq = xen_irq_ops.irq; + /* Initial interrupt flag handling only called while interrupts off. */ + pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0); + pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop); + pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(BUG_func); + pv_ops.irq.safe_halt = xen_safe_halt; + pv_ops.irq.halt = xen_halt; + x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f7999317f4d..0c32a92dc693 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -571,7 +571,6 @@ static int init_pv_ops(struct objtool_file *file) static const char *pv_ops_tables[] = { "pv_ops", "xen_cpu_ops", - "xen_irq_ops", "xen_mmu_ops", NULL, }; -- cgit v1.2.3 From 2f8d489897ae7183b535b1881478b2c6b66d520b Mon Sep 17 00:00:00 2001 From: George Guo Date: Sat, 10 Jan 2026 00:12:14 +0800 Subject: sched_ext: Add error logging for dsq creation failures Add scx_bpf_error() calls when scx_bpf_create_dsq() fails in multiple schedulers to improve debuggability: - scx_central.bpf.c: central_init() - scx_flatcg.bpf.c: fcg_cgroup_init() and fcg_init() - scx_qmap.bpf.c: qmap_init() Signed-off-by: George Guo Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.bpf.c | 4 +++- tools/sched_ext/scx_flatcg.bpf.c | 4 +++- tools/sched_ext/scx_qmap.bpf.c | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 55df8b798865..1c2376b75b5d 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -301,8 +301,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) int ret; ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } timer = bpf_map_lookup_elem(¢ral_timer, &key); if (!timer) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 43126858b8e4..c216480c3ee0 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -842,8 +842,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, * unlikely case that it breaks. */ ret = scx_bpf_create_dsq(cgid, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index df21fad0c438..d51d8c38f1cf 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -866,12 +866,16 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); return ret; + } ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret); return ret; + } timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) -- cgit v1.2.3 From e272628902c1c96731e2d9f62a7fc77767686eb0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 7 Jan 2026 14:32:16 +0100 Subject: perf test stat tests: Fix for virtualized machines On s390 'perf test's 'perf stat tests', subtest test_hybrid fails for z/VM systems. The root cause is this statement: $(perf stat -a -- sleep 0.1 2>&1 |\ grep -E "/cpu-cycles/[uH]*| cpu-cycles[:uH]* -c) The 'perf stat' output on a s390 z/VM system is # perf stat -a -- sleep 0.1 2>&1 Performance counter stats for 'system wide': 56 context-switches # 46.3 cs/sec cs_per_second 1,210.41 msec cpu-clock # 11.9 CPUs CPUs_utilized 12 cpu-migrations # 9.9 migrations/sec ... 81 page-faults # 66.9 faults/sec ... 0.100891009 seconds time elapsed The grep command does not match any single line and exits with error code 1. As the bash script is executed with 'set -e', it aborts with the first error code being non-zero. Fix this and use 'wc -l' to count matching lines instead of 'grep ... -c'. Output before: # perf test 102 102: perf stat tests : FAILED! # Output after: # perf test 102 102: perf stat tests : Ok # Fixes: bb6e7cb11d97ce19 ("perf tools: Add fallback for exclude_guest") Reviewed-by: Ian Rogers Reviewed-by: James Clark Signed-off-by: Thomas Richter Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Jan Polensky Cc: linux-s390@vger.kernel.org Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Vasily Gorbik Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh index 0b2f0f88ca16..792a0b79f6b8 100755 --- a/tools/perf/tests/shell/stat.sh +++ b/tools/perf/tests/shell/stat.sh @@ -233,7 +233,7 @@ test_hybrid() { fi # Run default Perf stat - cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*| cpu-cycles[:uH]* " -c) + cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*| cpu-cycles[:uH]* " | wc -l) # The expectation is that default output will have a cycles events on each # hybrid PMU. In situations with no cycles PMU events, like virtualized, this -- cgit v1.2.3 From 383f8e26e2c483e25453f8c3d0839877708ac701 Mon Sep 17 00:00:00 2001 From: Nicolas Schier Date: Thu, 8 Jan 2026 12:29:10 +0100 Subject: perf build: Raise minimum shellcheck version to 0.7.2 Raise the minimum shellcheck version for perf builds to 0.7.2, so that systems with shellcheck versions below 0.7.2 will automatically skip the shell script checking, even if NO_SHELLCHECK is unset. Since commit 241f21be7d0fdf3c ("perf test perftool_testsuite: Use absolute paths"), shellcheck versions before 0.7.2 break the perf build with several SC1090 [2] warnings due to its too strict dynamic source handling [1], e.g.: In tests/shell/base_probe/test_line_semantics.sh line 20: . "$DIR_PATH/../common/init.sh" ^---------------------------^ SC1090: Can't follow non-constant source. Use a directive to specify location. Fixes: 241f21be7d0fdf3c ("perf test perftool_testsuite: Use absolute paths") Signed-off-by: Nicolas Schier Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jakub Brnak Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Petlan Cc: Nicolas Schier Cc: Peter Zijlstra Cc: Philipp Hahn Cc: Veronika Molnarova Link: https://github.com/koalaman/shellcheck/issues/1998 # [1] Link: https://www.shellcheck.net/wiki/SC1090 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index b3f481a626af..e6895626c187 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -251,11 +251,12 @@ else endif # shellcheck is using in tools/perf/tests/Build with option -a/--check-sourced ( -# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0). So make the -# minimal shellcheck version as v0.6.0. +# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0) as well as +# dynamic source inclusions (properly handled since v0.7.2). +# So make the minimal shellcheck version as v0.7.2. ifneq ($(SHELLCHECK),) ifeq ($(shell expr $(shell $(SHELLCHECK) --version | grep version: | \ - sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 060), 1) + sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 072), 1) SHELLCHECK := else SHELLCHECK := $(SHELLCHECK) -s bash -a -S warning -- cgit v1.2.3 From 817f66e39e39b914aac25065a34f4462ab45ed26 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 Jan 2026 12:05:15 +0100 Subject: x86/xen: Drop xen_cpu_ops Instead of having a pre-filled array xen_cpu_ops for Xen PV paravirt functions, drop the array and assign each element individually. This is in preparation of reducing the paravirt include hell by splitting paravirt.h into multiple more fine grained header files, which will in turn require to split up the pv_ops vector as well. Dropping the pre-filled array makes life easier for objtool to detect missing initializers in multiple pv_ops_ arrays. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Boris Ostrovsky Link: https://patch.msgid.link/20260105110520.21356-17-jgross@suse.com --- arch/x86/xen/enlighten_pv.c | 82 ++++++++++++++++++--------------------------- tools/objtool/check.c | 1 - 2 files changed, 33 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index b74ff8bc7f2a..8a19a88190ee 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1212,54 +1212,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const typeof(pv_ops) xen_cpu_ops __initconst = { - .cpu = { - .cpuid = xen_cpuid, - - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, - - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, - - .write_cr4 = xen_write_cr4, - - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, - - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, - - .read_pmc = xen_read_pmc, - - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - .load_gs_index = xen_load_gs_index, - - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, - - .store_tr = xen_store_tr, - - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, - -#ifdef CONFIG_X86_IOPL_IOPERM - .invalidate_io_bitmap = xen_invalidate_io_bitmap, - .update_io_bitmap = xen_update_io_bitmap, -#endif - .io_delay = xen_io_delay, - - .start_context_switch = xen_start_context_switch, - .end_context_switch = xen_end_context_switch, - }, -}; - static void xen_restart(char *msg) { xen_reboot(SHUTDOWN_reboot); @@ -1411,7 +1363,39 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.cpu = xen_cpu_ops.cpu; + + pv_ops.cpu.cpuid = xen_cpuid; + pv_ops.cpu.set_debugreg = xen_set_debugreg; + pv_ops.cpu.get_debugreg = xen_get_debugreg; + pv_ops.cpu.read_cr0 = xen_read_cr0; + pv_ops.cpu.write_cr0 = xen_write_cr0; + pv_ops.cpu.write_cr4 = xen_write_cr4; + pv_ops.cpu.read_msr = xen_read_msr; + pv_ops.cpu.write_msr = xen_write_msr; + pv_ops.cpu.read_msr_safe = xen_read_msr_safe; + pv_ops.cpu.write_msr_safe = xen_write_msr_safe; + pv_ops.cpu.read_pmc = xen_read_pmc; + pv_ops.cpu.load_tr_desc = paravirt_nop; + pv_ops.cpu.set_ldt = xen_set_ldt; + pv_ops.cpu.load_gdt = xen_load_gdt; + pv_ops.cpu.load_idt = xen_load_idt; + pv_ops.cpu.load_tls = xen_load_tls; + pv_ops.cpu.load_gs_index = xen_load_gs_index; + pv_ops.cpu.alloc_ldt = xen_alloc_ldt; + pv_ops.cpu.free_ldt = xen_free_ldt; + pv_ops.cpu.store_tr = xen_store_tr; + pv_ops.cpu.write_ldt_entry = xen_write_ldt_entry; + pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; + pv_ops.cpu.write_idt_entry = xen_write_idt_entry; + pv_ops.cpu.load_sp0 = xen_load_sp0; +#ifdef CONFIG_X86_IOPL_IOPERM + pv_ops.cpu.invalidate_io_bitmap = xen_invalidate_io_bitmap; + pv_ops.cpu.update_io_bitmap = xen_update_io_bitmap; +#endif + pv_ops.cpu.io_delay = xen_io_delay; + pv_ops.cpu.start_context_switch = xen_start_context_switch; + pv_ops.cpu.end_context_switch = xen_end_context_switch; + xen_init_irq_ops(); /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0c32a92dc693..8ab88f2b2c1b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -570,7 +570,6 @@ static int init_pv_ops(struct objtool_file *file) { static const char *pv_ops_tables[] = { "pv_ops", - "xen_cpu_ops", "xen_mmu_ops", NULL, }; -- cgit v1.2.3 From 6e5f2ad6bb74fd743c2162e32ac15e9061591ab1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Dec 2025 09:36:10 -0800 Subject: perf stat display: Make %f precision consistent Commit bc22de9bcdb22491 ("perf stat: Display time in precision based on std deviation") added multirun workload elapsed time. There was an effort to make the precision in the output most useful for the user, however, when gathering over runs it means the formatting varies. This change just makes the output format fixed. Before: ``` $ while :; do perf stat --null --repeat 3 sleep 0.1 2>&1 | grep elapsed; done 0.101140 +- 0.000149 seconds time elapsed ( +- 0.15% ) 0.1011396 +- 0.0000218 seconds time elapsed ( +- 0.02% ) 0.101331 +- 0.000124 seconds time elapsed ( +- 0.12% ) ^C $ while :; do perf stat --null --repeat 3 sleep 1 2>&1 | grep elapsed; done 1.001317 +- 0.000146 seconds time elapsed ( +- 0.01% ) 1.001377 +- 0.000172 seconds time elapsed ( +- 0.02% ) 1.00253 +- 0.00131 seconds time elapsed ( +- 0.13% ) ``` After: ``` $ while :; do perf stat --null --repeat 3 sleep 0.1 2>&1 | grep elapsed; done 0.101406408 +- 0.000064778 seconds time elapsed ( +- 0.06% ) 0.101367315 +- 0.000027253 seconds time elapsed ( +- 0.03% ) 0.101434164 +- 0.000084750 seconds time elapsed ( +- 0.08% ) ^C $ while :; do perf stat --null --repeat 3 sleep 1 2>&1 | grep elapsed; done 1.001525467 +- 0.000051703 seconds time elapsed ( +- 0.01% ) 1.001375093 +- 0.000116200 seconds time elapsed ( +- 0.01% ) 1.001141025 +- 0.000046361 seconds time elapsed ( +- 0.00% ) ``` Closes: https://lore.kernel.org/lkml/aTQRgAOpKyI53TEq@gmail.com/ Suggested-by: Ingo Molnar Signed-off-by: Ian Rogers Tested-by: Ingo Molnar Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Chun-Tse Shao Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 6d02f84c5691..2ce0602974a1 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -1397,21 +1397,12 @@ static void print_header(struct perf_stat_config *config, num_print_iv = 0; } -static int get_precision(double num) -{ - if (num > 1) - return 0; - - return lround(ceil(-log10(num))); -} - -static void print_table(struct perf_stat_config *config, - FILE *output, int precision, double avg) +static void print_table(struct perf_stat_config *config, FILE *output, double avg) { char tmp[64]; int idx, indent = 0; - scnprintf(tmp, 64, " %17.*f", precision, avg); + scnprintf(tmp, 64, " %17.9f", avg); while (tmp[indent] == ' ') indent++; @@ -1421,8 +1412,7 @@ static void print_table(struct perf_stat_config *config, double run = (double) config->walltime_run[idx] / NSEC_PER_SEC; int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5); - fprintf(output, " %17.*f (%+.*f) ", - precision, run, precision, run - avg); + fprintf(output, " %17.9f (%+.9f) ", run, run - avg); for (h = 0; h < n; h++) fprintf(output, "#"); @@ -1462,17 +1452,11 @@ static void print_footer(struct perf_stat_config *config) } } else { double sd = stddev_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; - /* - * Display at most 2 more significant - * digits than the stddev inaccuracy. - */ - int precision = get_precision(sd) + 2; if (config->walltime_run_table) - print_table(config, output, precision, avg); + print_table(config, output, avg); - fprintf(output, " %17.*f +- %.*f seconds time elapsed", - precision, avg, precision, sd); + fprintf(output, " %17.9f +- %.9f seconds time elapsed", avg, sd); print_noise_pct(config, NULL, sd, avg, /*before_metric=*/false); } -- cgit v1.2.3 From ef92c4351ec75bcfb8a1cc3a88109b5339f296ef Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 8 Dec 2025 09:23:39 -0800 Subject: perf test subcmd help: Add exclude disjoint subcmd names The test is based on an error/fix posted to linux-perf-users. Reported-by: Sri Jayaramappa Reviewed-by: Sri Jayaramappa Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Closes: https://lore.kernel.org/linux-perf-users/20251202213632.2873731-1-sjayaram@akamai.com/ Closes: https://urldefense.com/v3/__https://lore.kernel.org/linux-perf-users/20251202213632.2873731-1-sjayaram@akamai.com/__;!!GjvTz_vk!XehekKNUE4Ib_tvqIH6PMIIhly4X3BZ-Y40RC1HKMQ-6OdYEFvUPQhyWv_gk9vsRRN4_RcOLS2Bh0CQ$ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/subcmd-help.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/subcmd-help.c b/tools/perf/tests/subcmd-help.c index 2280b4c0e5e7..9da96a16fd20 100644 --- a/tools/perf/tests/subcmd-help.c +++ b/tools/perf/tests/subcmd-help.c @@ -95,10 +95,36 @@ static int test__exclude_cmdnames(struct test_suite *test __maybe_unused, return TEST_OK; } +static int test__exclude_cmdnames_no_overlap(struct test_suite *test __maybe_unused, + int subtest __maybe_unused) +{ + struct cmdnames cmds1 = {}; + struct cmdnames cmds2 = {}; + + add_cmdname(&cmds1, "read-vdso32", 11); + add_cmdname(&cmds2, "archive", 7); + + TEST_ASSERT_VAL("invalid original size", cmds1.cnt == 1); + TEST_ASSERT_VAL("invalid original size", cmds2.cnt == 1); + + exclude_cmds(&cmds1, &cmds2); + + TEST_ASSERT_VAL("invalid excluded size", cmds1.cnt == 1); + TEST_ASSERT_VAL("invalid excluded size", cmds2.cnt == 1); + + TEST_ASSERT_VAL("cannot find cmd", is_in_cmdlist(&cmds1, "read-vdso32") == 1); + TEST_ASSERT_VAL("wrong cmd", is_in_cmdlist(&cmds1, "archive") == 0); + + clean_cmdnames(&cmds1); + clean_cmdnames(&cmds2); + return TEST_OK; +} + static struct test_case tests__subcmd_help[] = { TEST_CASE("Load subcmd names", load_cmdnames), TEST_CASE("Uniquify subcmd names", uniq_cmdnames), TEST_CASE("Exclude duplicate subcmd names", exclude_cmdnames), + TEST_CASE("Exclude disjoint subcmd names", exclude_cmdnames_no_overlap), { .name = NULL, } }; -- cgit v1.2.3 From 2a3602030d800b6600ef55c31e21bc54611f7770 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:20 -0500 Subject: cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict Currently, when setting a cpuset's cpuset.cpus to a value that conflicts with the cpuset.cpus/cpuset.cpus.exclusive of a sibling partition, the sibling's partition state becomes invalid. This is overly harsh and is probably not necessary. The cpuset.cpus.exclusive control file, if set, will override the cpuset.cpus of the same cpuset when creating a cpuset partition. So cpuset.cpus has less priority than cpuset.cpus.exclusive in setting up a partition. However, it cannot override a conflicting cpuset.cpus file in a sibling cpuset and the partition creation process will fail. This is inconsistent. That will also make using cpuset.cpus.exclusive less valuable as a tool to set up cpuset partitions as the users have to check if such a cpuset.cpus conflict exists or not. Fix these problems by making sure that once a cpuset.cpus.exclusive is set without failure, it will always be allowed to form a valid partition as long as at least one CPU can be granted from its parent irrespective of the state of the siblings' cpuset.cpus values. Of course, setting cpuset.cpus.exclusive will fail if it conflicts with the cpuset.cpus.exclusive or the cpuset.cpus.exclusive.effective value of a sibling. Partition can still be created by setting only cpuset.cpus without setting cpuset.cpus.exclusive. However, any conflicting CPUs in sibling's cpuset.cpus.exclusive.effective and cpuset.cpus.exclusive values will be removed from its cpuset.cpus.exclusive.effective as long as there is still one or more CPUs left and can be granted from its parent. This CPU stripping is currently done in rm_siblings_excl_cpus(). The new code will now try its best to enable the creation of new partitions with only cpuset.cpus set without invalidating existing ones. However it is not guaranteed that all the CPUs requested in cpuset.cpus will be used in the new partition even when all these CPUs can be granted from the parent. This is similar to the fact that cpuset.cpus.effective may not be able to include all the CPUs requested in cpuset.cpus. In this case, the parent may not able to grant all the exclusive CPUs requested in cpuset.cpus to cpuset.cpus.exclusive.effective if some of them have already been granted to other partitions earlier. With the creation of multiple sibling partitions by setting only cpuset.cpus, this does have the side effect that their exact cpuset.cpus.exclusive.effective settings will depend on the order of partition creation if there are conflicts. Due to the exclusive nature of the CPUs in a partition, it is not easy to make it fair other than the old behavior of invalidating all the conflicting partitions. For example, # echo "0-2" > A1/cpuset.cpus # echo "root" > A1/cpuset.cpus.partition # cat A1/cpuset.cpus.partition root # cat A1/cpuset.cpus.exclusive.effective 0-2 # echo "2-4" > B1/cpuset.cpus # echo "root" > B1/cpuset.cpus.partition # cat B1/cpuset.cpus.partition root # cat B1/cpuset.cpus.exclusive.effective 3-4 # cat B1/cpuset.cpus.effective 3-4 For users who want to be sure that they can get most of the CPUs they want, cpuset.cpus.exclusive should be used instead if they can set it successfully without failure. Setting cpuset.cpus.exclusive will guarantee that sibling conflicts from then onward is no longer possible. To make this change, we have to separate out the is_cpu_exclusive() check in cpus_excl_conflict() into a cgroup v1 only cpuset1_cpus_excl_conflict() helper. The cpus_allowed_validate_change() helper is now no longer needed and can be removed. Some existing tests in test_cpuset_prs.sh are updated and new ones are added to reflect the new behavior. The cgroup-v2.rst doc file is also updated the clarify what exclusive CPUs will be used when a partition is created. Reported-by: Sun Shaojie Closes: https://lore.kernel.org/lkml/20251117015708.977585-1-sunshaojie@kylinos.cn/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 33 +++++++--- kernel/cgroup/cpuset-internal.h | 3 + kernel/cgroup/cpuset-v1.c | 19 ++++++ kernel/cgroup/cpuset.c | 80 ++++++++--------------- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 26 ++++++-- 5 files changed, 90 insertions(+), 71 deletions(-) (limited to 'tools') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 510df2461aff..28613c0e1c90 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2584,9 +2584,9 @@ Cpuset Interface Files of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" - if it is set. If "cpuset.cpus.exclusive" is not set, it is - treated to have an implicit value of "cpuset.cpus" in the - formation of local partition. + if it is set. This file should only be non-empty if either + "cpuset.cpus.exclusive" is set or when the current cpuset is + a valid partition root. cpuset.cpus.isolated A read-only and root cgroup only multiple values file. @@ -2618,13 +2618,22 @@ Cpuset Interface Files There are two types of partitions - local and remote. A local partition is one whose parent cgroup is also a valid partition root. A remote partition is one whose parent cgroup is not a - valid partition root itself. Writing to "cpuset.cpus.exclusive" - is optional for the creation of a local partition as its - "cpuset.cpus.exclusive" file will assume an implicit value that - is the same as "cpuset.cpus" if it is not set. Writing the - proper "cpuset.cpus.exclusive" values down the cgroup hierarchy - before the target partition root is mandatory for the creation - of a remote partition. + valid partition root itself. + + Writing to "cpuset.cpus.exclusive" is optional for the creation + of a local partition as its "cpuset.cpus.exclusive" file will + assume an implicit value that is the same as "cpuset.cpus" if it + is not set. Writing the proper "cpuset.cpus.exclusive" values + down the cgroup hierarchy before the target partition root is + mandatory for the creation of a remote partition. + + Not all the CPUs requested in "cpuset.cpus.exclusive" can be + used to form a new partition. Only those that were present + in its parent's "cpuset.cpus.exclusive.effective" control + file can be used. For partitions created without setting + "cpuset.cpus.exclusive", exclusive CPUs specified in sibling's + "cpuset.cpus.exclusive" or "cpuset.cpus.exclusive.effective" + also cannot be used. Currently, a remote partition cannot be created under a local partition. All the ancestors of a remote partition root except @@ -2632,6 +2641,10 @@ Cpuset Interface Files The root cgroup is always a partition root and its state cannot be changed. All other non-root cgroups start out as "member". + Even though the "cpuset.cpus.exclusive*" and "cpuset.cpus" + control files are not present in the root cgroup, they are + implicitly the same as the "/sys/devices/system/cpu/possible" + sysfs file. When set to "root", the current cgroup is the root of a new partition or scheduling domain. The set of exclusive CPUs is diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e718a4f54360..e8e2683cb067 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -312,6 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); int cpuset1_generate_sched_domains(cpumask_var_t **domains, @@ -326,6 +327,8 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ecfea7800f0d..04124c38a774 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -373,6 +373,25 @@ out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4819ab429771..83fb83a86b4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -129,6 +129,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -616,27 +627,25 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of a sibling cpuset cannot be a subset of the new exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. */ static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) - return !cpusets_are_exclusive(trial, sibling); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -2312,43 +2321,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2408,15 +2380,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2439,7 +2411,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index a17256d9f88a..ff4540b0490e 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -269,7 +269,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2" + " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2" " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" @@ -318,7 +318,7 @@ TEST_MATRIX=( # Invalid to valid local partition direct transition tests " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" - " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:4-6 A1:P-2|B1:P0" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0" " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" # Local partition invalidation tests @@ -388,10 +388,10 @@ TEST_MATRIX=( " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" - # A non-exclusive cpuset.cpus change will invalidate partition and its siblings - " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P0" - " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P-1" - " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P0|B1:P-1" + # A non-exclusive cpuset.cpus change will not invalidate its siblings partition. + " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:3 A1:P1|B1:P0" + " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|XA1:0-1|B1:2-3 A1:P1|B1:P1" + " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|B1:2-3 A1:P0|B1:P1" # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3|B1:4-5" @@ -417,6 +417,14 @@ TEST_MATRIX=( " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ A1:P0|A2:P2:B1:P-1 2-4" + # When multiple partitions with conflicting cpuset.cpus are created, the + # latter created ones will only get what are left of the available exclusive + # CPUs. + " C1-3:P1 . . . . . . C3-5:P1 0 A1:1-3|B1:4-5:XB1:4-5 A1:P1|B1:P1" + + # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive + " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -427,7 +435,7 @@ TEST_MATRIX=( # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5" - # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + # cpuset.cpus.exclusive cannot be set to a superset of sibling's cpuset.cpus " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5" ) @@ -477,6 +485,10 @@ REMOTE_TEST_MATRIX=( . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \ 1-2,4-6|1-2,5-6" + # c12 whose cpuset.cpus CPUs are all granted to c11 will become invalid partition + " C1-5:P1:S+ . C1-4:P1 C2-3 . . \ + . . . P1 . . p1:5|c11:1-4|c12:5 \ + p1:P1|c11:P1|c12:P-1" ) # -- cgit v1.2.3 From 272bd8183376a9e20fe08bacbaa44003d7c8acaa Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:21 -0500 Subject: cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change() As stated in commit 1c09b195d37f ("cpuset: fix a regression in validating config change"), it is not allowed to clear masks of a cpuset if there're tasks in it. This is specific to v1 since empty "cpuset.cpus" or "cpuset.mems" will cause the v2 cpuset to inherit the effective CPUs or memory nodes from its parent. So it is OK to have empty cpus or mems even if there are tasks in the cpuset. Move this empty cpus/mems check in validate_change() to cpuset1_validate_change() to allow more flexibility in setting cpus or mems in v2. cpuset_is_populated() needs to be moved into cpuset-internal.h as it is needed by the empty cpus/mems checking code. Also add a test case to test_cpuset_prs.sh to verify that. Reported-by: Chen Ridong Closes: https://lore.kernel.org/lkml/7a3ec392-2e86-4693-aa9f-1e668a668b9c@huaweicloud.com/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 9 +++++++++ kernel/cgroup/cpuset-v1.c | 14 ++++++++++++++ kernel/cgroup/cpuset.c | 23 ----------------------- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 3 +++ 4 files changed, 26 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e8e2683cb067..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -260,6 +260,15 @@ static inline int nr_cpusets(void) return static_key_count(&cpusets_enabled_key.key) + 1; } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 04124c38a774..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -368,6 +368,20 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 83fb83a86b4b..a3dbca125588 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -370,15 +370,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -695,20 +686,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index ff4540b0490e..5dff3ad53867 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -425,6 +425,9 @@ TEST_MATRIX=( # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" + # cpuset.cpus can become empty with task in it as it inherits parent's effective CPUs + " C1-3:S+ C2 . . . T:C . . 0 A1:1-3|A2:1-3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: -- cgit v1.2.3 From f815fc0c66e777c727689666cfb46b8d461c2f99 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:32 -0800 Subject: perf unwind-libdw: Fix invalid reference counts The addition of addr_location__exit() causes use-after put on the maps and map references in the unwind info. Add the gets and then add the map_symbol__exit() calls. Fixes: 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions") Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index ae70fb56a057..3ff427a49e4c 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -136,8 +136,8 @@ static int entry(u64 ip, struct unwind_info *ui) } e->ip = ip; - e->ms.maps = al.maps; - e->ms.map = al.map; + e->ms.maps = maps__get(al.maps); + e->ms.map = map__get(al.map); e->ms.sym = al.sym; pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n", @@ -325,6 +325,9 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (err) pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1)); + for (i = 0; i < ui->idx; i++) + map_symbol__exit(&ui->entries[i].ms); + dwfl_end(ui->dwfl); free(ui); return 0; -- cgit v1.2.3 From 27fc6f565d06837e71001368c84ee71e5221ce48 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:37 -0800 Subject: perf test workload: Add inlineloop test workload The purpose of this workload is to gather samples in an inlined function. This can be used to test whether inlined addr2line works correctly. Committer testing: $ perf record perf test -w inlineloop 1 [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.161 MB perf.data (4005 samples) ] $ perf report --stdio --dso perf -s srcfile,srcline # # Total Lost Samples: 0 # # Samples: 4K of event 'cpu/cycles/Pu' # Event count (approx.): 5535180842 # # Overhead Source File Source:Line # ........ ............ ............... # 99.04% inlineloop.c inlineloop.c:21 0.46% inlineloop.c inlineloop.c:20 # $ Reviewed-by: James Clark Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 1 + tools/perf/tests/tests.h | 1 + tools/perf/tests/workloads/Build | 2 ++ tools/perf/tests/workloads/inlineloop.c | 52 +++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 tools/perf/tests/workloads/inlineloop.c (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index bd6ffa8e4578..e2490652f030 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -153,6 +153,7 @@ static struct test_workload *workloads[] = { &workload__datasym, &workload__landlock, &workload__traploop, + &workload__inlineloop, }; #define workloads__for_each(workload) \ diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index cb67ddbd0375..1f0f8b267fb1 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -240,6 +240,7 @@ DECLARE_WORKLOAD(brstack); DECLARE_WORKLOAD(datasym); DECLARE_WORKLOAD(landlock); DECLARE_WORKLOAD(traploop); +DECLARE_WORKLOAD(inlineloop); extern const char *dso_to_test; extern const char *test_objdump_path; diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build index fb1012cc4fc3..866a00bd14a0 100644 --- a/tools/perf/tests/workloads/Build +++ b/tools/perf/tests/workloads/Build @@ -8,9 +8,11 @@ perf-test-y += brstack.o perf-test-y += datasym.o perf-test-y += landlock.o perf-test-y += traploop.o +perf-test-y += inlineloop.o CFLAGS_sqrtloop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_leafloop.o = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE CFLAGS_brstack.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_datasym.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_traploop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE +CFLAGS_inlineloop.o = -g -O2 diff --git a/tools/perf/tests/workloads/inlineloop.c b/tools/perf/tests/workloads/inlineloop.c new file mode 100644 index 000000000000..bc82dfc7c410 --- /dev/null +++ b/tools/perf/tests/workloads/inlineloop.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include "../tests.h" + +static volatile int a; +static volatile sig_atomic_t done; + +static void sighandler(int sig __maybe_unused) +{ + done = 1; +} + +static inline void __attribute__((always_inline)) leaf(int b) +{ +again: + a += b; + if (!done) + goto again; +} + +static inline void __attribute__((always_inline)) middle(int b) +{ + leaf(b); +} + +static noinline void parent(int b) +{ + middle(b); +} + +static int inlineloop(int argc, const char **argv) +{ + int sec = 1; + + pthread_setname_np(pthread_self(), "perf-inlineloop"); + if (argc > 0) + sec = atoi(argv[0]); + + signal(SIGINT, sighandler); + signal(SIGALRM, sighandler); + alarm(sec); + + parent(sec); + + return 0; +} + +DEFINE_WORKLOAD(inlineloop); -- cgit v1.2.3 From 88c51002d06f9a68a2b666f7e2c262b6e198f566 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:33 -0800 Subject: perf addr2line: Add a libdw implementation Add an implementation of addr2line that uses libdw. Other addr2line implementations are slow, particularly in the case of forking addr2line. Add an implementation that caches the libdw information in the dso and uses it to find the file and line number information. Inline information is supported but because cu_walk_functions_at visits the leaf function last add a inline_list__append_tail to reverse the lists order. Committer testing: # perf probe -x ~/bin/perf libdw__addr2line Added new event: probe_perf:libdw_addr2line (on libdw__addr2line in /home/acme/bin/perf) You can now use it in all perf tools, such as: perf record -e probe_perf:libdw_addr2line -aR sleep 1 # # perf stat -e probe_perf:libdw_addr2line perf report -f --dso perf --stdio -s srcfile,srcline # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4K of event 'cpu/cycles/Pu' # Event count (approx.): 5535180842 # # Overhead Source File Source:Line # ........ ............ ............... # 99.04% inlineloop.c inlineloop.c:21 0.46% inlineloop.c inlineloop.c:20 # # (Tip: For tracepoint events, try: perf report -s trace_fields) # Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline': 44 probe_perf:libdw_addr2line 0.037260744 seconds time elapsed 0.025299000 seconds user 0.011918000 seconds sys # Adding probes to the other addr2line implementations (llvm__addr2line, libbfd__addr2line and cmd__addr2line) I noticed some fallbacks to the llvm one: Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline': 44 probe_perf:libdw_addr2line 23 probe_perf:llvm_addr2line 0 probe_perf:libbfd_addr2line 0 probe_perf:cmd_addr2line Something to investigate further, but at least we don't fallback to the cmd based one :-) Reviewed-by: James Clark Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/dso.c | 2 + tools/perf/util/dso.h | 11 ++++ tools/perf/util/libdw.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/libdw.h | 60 ++++++++++++++++++ tools/perf/util/srcline.c | 24 ++++++++ tools/perf/util/srcline.h | 1 + 7 files changed, 252 insertions(+) create mode 100644 tools/perf/util/libdw.c create mode 100644 tools/perf/util/libdw.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 1c2a43e1dc68..2bed6274e248 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -224,6 +224,7 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o perf-util-$(CONFIG_LIBDW) += debuginfo.o perf-util-$(CONFIG_LIBDW) += annotate-data.o +perf-util-$(CONFIG_LIBDW) += libdw.o perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind-local.o diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 344e689567ee..06980844c014 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -32,6 +32,7 @@ #include "string2.h" #include "vdso.h" #include "annotate-data.h" +#include "libdw.h" static const char * const debuglink_paths[] = { "%.0s%s", @@ -1605,6 +1606,7 @@ void dso__delete(struct dso *dso) auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache); dso_cache__free(dso); dso__free_a2l(dso); + dso__free_a2l_libdw(dso); dso__free_symsrc_filename(dso); nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo); mutex_destroy(dso__lock(dso)); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index f8ccb9816b89..4aee23775054 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -268,6 +268,7 @@ DECLARE_RC_STRUCT(dso) { const char *short_name; const char *long_name; void *a2l; + void *a2l_libdw; char *symsrc_filename; #if defined(__powerpc__) void *dwfl; /* DWARF debug info */ @@ -334,6 +335,16 @@ static inline void dso__set_a2l(struct dso *dso, void *val) RC_CHK_ACCESS(dso)->a2l = val; } +static inline void *dso__a2l_libdw(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->a2l_libdw; +} + +static inline void dso__set_a2l_libdw(struct dso *dso, void *val) +{ + RC_CHK_ACCESS(dso)->a2l_libdw = val; +} + static inline unsigned int dso__a2l_fails(const struct dso *dso) { return RC_CHK_ACCESS(dso)->a2l_fails; diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c new file mode 100644 index 000000000000..e4bfd52bd172 --- /dev/null +++ b/tools/perf/util/libdw.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "dso.h" +#include "libdw.h" +#include "srcline.h" +#include "symbol.h" +#include "dwarf-aux.h" +#include +#include +#include + +void dso__free_a2l_libdw(struct dso *dso) +{ + Dwfl *dwfl = dso__a2l_libdw(dso); + + if (dwfl) { + dwfl_end(dwfl); + dso__set_a2l_libdw(dso, NULL); + } +} + +struct libdw_a2l_cb_args { + struct dso *dso; + struct symbol *sym; + struct inline_node *node; + char *leaf_srcline; + bool leaf_srcline_used; +}; + +static int libdw_a2l_cb(Dwarf_Die *die, void *_args) +{ + struct libdw_a2l_cb_args *args = _args; + struct symbol *inline_sym = new_inline_sym(args->dso, args->sym, dwarf_diename(die)); + const char *call_fname = die_get_call_file(die); + char *call_srcline = srcline__unknown; + struct inline_list *ilist; + + if (!inline_sym) + return -ENOMEM; + + /* Assign caller information to the parent. */ + if (call_fname) + call_srcline = srcline_from_fileline(call_fname, die_get_call_lineno(die)); + + list_for_each_entry(ilist, &args->node->val, list) { + ilist->srcline = call_srcline; + call_srcline = NULL; + break; + } + if (call_srcline && call_fname) + free(call_srcline); + + /* Add this symbol to the chain as the leaf. */ + inline_list__append_tail(inline_sym, args->leaf_srcline, args->node); + args->leaf_srcline_used = true; + return 0; +} + +int libdw__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line_nr, + struct dso *dso, bool unwind_inlines, + struct inline_node *node, struct symbol *sym) +{ + static const Dwfl_Callbacks offline_callbacks = { + .find_debuginfo = dwfl_standard_find_debuginfo, + .section_address = dwfl_offline_section_address, + .find_elf = dwfl_build_id_find_elf, + }; + Dwfl *dwfl = dso__a2l_libdw(dso); + Dwfl_Module *mod; + Dwfl_Line *dwline; + Dwarf_Addr bias; + const char *src; + int lineno = 0; + + if (!dwfl) { + /* + * Initialize Dwfl session. + * We need to open the DSO file to report it to libdw. + */ + int fd; + + fd = open(dso_name, O_RDONLY); + if (fd < 0) + return 0; + + dwfl = dwfl_begin(&offline_callbacks); + if (!dwfl) { + close(fd); + return 0; + } + + /* + * If the report is successful, the file descriptor fd is consumed + * and closed by the Dwfl. If not, it is not closed. + */ + mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd); + if (!mod) { + dwfl_end(dwfl); + close(fd); + return 0; + } + + dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL); + dso__set_a2l_libdw(dso, dwfl); + } else { + /* Dwfl session already initialized, get module for address. */ + mod = dwfl_addrmodule(dwfl, addr); + } + + if (!mod) + return 0; + + /* + * Get/ignore the dwarf information. Determine the bias, difference + * between the regular ELF addr2line addresses and those to use with + * libdw. + */ + if (!dwfl_module_getdwarf(mod, &bias)) + return 0; + + /* Find source line information for the address. */ + dwline = dwfl_module_getsrc(mod, addr + bias); + if (!dwline) + return 0; + + /* Get line information. */ + src = dwfl_lineinfo(dwline, /*addr=*/NULL, &lineno, /*col=*/NULL, /*mtime=*/NULL, + /*length=*/NULL); + + if (file) + *file = src ? strdup(src) : NULL; + if (line_nr) + *line_nr = lineno; + + /* Optionally unwind inline function call chain. */ + if (unwind_inlines && node) { + Dwarf_Addr unused_bias; + Dwarf_Die *cudie = dwfl_module_addrdie(mod, addr + bias, &unused_bias); + struct libdw_a2l_cb_args args = { + .dso = dso, + .sym = sym, + .node = node, + .leaf_srcline = srcline_from_fileline(src ?: "", lineno), + }; + + /* Walk from the parent down to the leaf. */ + cu_walk_functions_at(cudie, addr, libdw_a2l_cb, &args); + + if (!args.leaf_srcline_used) + free(args.leaf_srcline); + } + return 1; +} diff --git a/tools/perf/util/libdw.h b/tools/perf/util/libdw.h new file mode 100644 index 000000000000..0f8d7b4a11a5 --- /dev/null +++ b/tools/perf/util/libdw.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef PERF_LIBDW_H +#define PERF_LIBDW_H + +#include + +struct dso; +struct inline_node; +struct symbol; + +#ifdef HAVE_LIBDW_SUPPORT +/* + * libdw__addr2line - Convert address to source location using libdw + * @dso_name: Name of the DSO + * @addr: Address to resolve + * @file: Pointer to return filename (caller must free) + * @line_nr: Pointer to return line number + * @dso: The dso struct + * @unwind_inlines: Whether to unwind inline function calls + * @node: Inline node list to append to + * @sym: The symbol associated with the address + * + * This function initializes a Dwfl context for the DSO if not already present, + * finds the source line information for the given address, and optionally + * resolves inline function call chains. + * + * Returns 1 on success (found), 0 on failure (not found). + */ +int libdw__addr2line(const char *dso_name, u64 addr, char **file, + unsigned int *line_nr, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym); + +/* + * dso__free_a2l_libdw - Free libdw resources associated with the DSO + * @dso: The dso to free resources for + * + * This function cleans up the Dwfl context used for addr2line lookups. + */ +void dso__free_a2l_libdw(struct dso *dso); + +#else /* HAVE_LIBDW_SUPPORT */ + +static inline int libdw__addr2line(const char *dso_name __maybe_unused, + u64 addr __maybe_unused, char **file __maybe_unused, + unsigned int *line_nr __maybe_unused, + struct dso *dso __maybe_unused, + bool unwind_inlines __maybe_unused, + struct inline_node *node __maybe_unused, + struct symbol *sym __maybe_unused) +{ + return 0; +} + +static inline void dso__free_a2l_libdw(struct dso *dso __maybe_unused) +{ +} +#endif /* HAVE_LIBDW_SUPPORT */ + +#endif /* PERF_LIBDW_H */ diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 27c0966611ab..e2d280678b02 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -6,6 +6,7 @@ #include "libbfd.h" #include "llvm.h" #include "symbol.h" +#include "libdw.h" #include #include @@ -51,6 +52,25 @@ int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node return 0; } +int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node) +{ + struct inline_list *ilist; + + ilist = zalloc(sizeof(*ilist)); + if (ilist == NULL) + return -1; + + ilist->symbol = symbol; + ilist->srcline = srcline; + + if (callchain_param.order == ORDER_CALLEE) + list_add(&ilist->list, &node->val); + else + list_add_tail(&ilist->list, &node->val); + + return 0; +} + /* basename version that takes a const input string */ static const char *gnu_basename(const char *path) { @@ -120,6 +140,10 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int * { int ret; + ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); + if (ret > 0) + return ret; + ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); if (ret > 0) return ret; diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index c36f573cd339..be9f002bf234 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -57,6 +57,7 @@ struct inline_node *inlines__tree_find(struct rb_root_cached *tree, u64 addr); void inlines__tree_delete(struct rb_root_cached *tree); int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node *node); +int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node); char *srcline_from_fileline(const char *file, unsigned int line); struct symbol *new_inline_sym(struct dso *dso, struct symbol *base_sym, -- cgit v1.2.3 From ec9426655dcee3e337735935dcc2dea7684a5bf8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:34 -0800 Subject: perf addr2line.c: Rename a2l_style to cmd_a2l_style The a2l_style is only relevant to the command line version, so rename to make this clearer. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/addr2line.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c index f2d94a3272d7..0f1499350d47 100644 --- a/tools/perf/util/addr2line.c +++ b/tools/perf/util/addr2line.c @@ -90,16 +90,16 @@ static struct child_process *addr2line_subprocess_init(const char *addr2line_pat return a2l; } -enum a2l_style { +enum cmd_a2l_style { BROKEN, GNU_BINUTILS, LLVM, }; -static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name) +static enum cmd_a2l_style cmd_addr2line_configure(struct child_process *a2l, const char *dso_name) { static bool cached; - static enum a2l_style style; + static enum cmd_a2l_style style; if (!cached) { char buf[128]; @@ -149,7 +149,7 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char } static int read_addr2line_record(struct io *io, - enum a2l_style style, + enum cmd_a2l_style style, const char *dso_name, u64 addr, bool first, @@ -298,7 +298,7 @@ int cmd__addr2line(const char *dso_name, u64 addr, char buf[128]; ssize_t written; struct io io = { .eof = false }; - enum a2l_style a2l_style; + enum cmd_a2l_style cmd_a2l_style; if (!a2l) { if (!filename__has_section(dso_name, ".debug_line")) @@ -314,8 +314,8 @@ int cmd__addr2line(const char *dso_name, u64 addr, pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name); goto out; } - a2l_style = addr2line_configure(a2l, dso_name); - if (a2l_style == BROKEN) + cmd_a2l_style = cmd_addr2line_configure(a2l, dso_name); + if (cmd_a2l_style == BROKEN) goto out; /* @@ -336,7 +336,7 @@ int cmd__addr2line(const char *dso_name, u64 addr, } io__init(&io, a2l->out, buf, sizeof(buf)); io.timeout_ms = addr2line_timeout_ms; - switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true, + switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, addr, /*first=*/true, &record_function, &record_filename, &record_line_nr)) { case -1: if (!symbol_conf.disable_add2line_warn) @@ -351,7 +351,7 @@ int cmd__addr2line(const char *dso_name, u64 addr, * binutils, also force a non-zero address as we're no longer * reading that record. */ - switch (read_addr2line_record(&io, a2l_style, dso_name, + switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, /*addr=*/1, /*first=*/true, NULL, NULL, NULL)) { case -1: @@ -397,7 +397,7 @@ int cmd__addr2line(const char *dso_name, u64 addr, * as we're reading records beyond the first. */ while ((record_status = read_addr2line_record(&io, - a2l_style, + cmd_a2l_style, dso_name, /*addr=*/1, /*first=*/false, -- cgit v1.2.3 From 7aef17f367c94d6cef00f45b193e37d30ff4a3b5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 Jan 2026 12:05:16 +0100 Subject: x86/xen: Drop xen_mmu_ops Instead of having a pre-filled array xen_mmu_ops for Xen PV paravirt functions, drop the array and assign each element individually. This is in preparation of reducing the paravirt include hell by splitting paravirt.h into multiple more fine grained header files, which will in turn require to split up the pv_ops vector as well. Dropping the pre-filled array makes life easier for objtool to detect missing initializers in multiple pv_ops_ arrays. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Boris Ostrovsky Link: https://patch.msgid.link/20260105110520.21356-18-jgross@suse.com --- arch/x86/xen/mmu_pv.c | 100 +++++++++++++++++++------------------------------- tools/objtool/check.c | 1 - 2 files changed, 38 insertions(+), 63 deletions(-) (limited to 'tools') diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2a4a8deaf612..9fa00c4a8858 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2175,73 +2175,49 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const typeof(pv_ops) xen_mmu_ops __initconst = { - .mmu = { - .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_one_user = xen_flush_tlb_one_user, - .flush_tlb_multi = xen_flush_tlb_multi, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, - - .set_pte = xen_set_pte_init, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = xen_ptep_modify_prot_start, - .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, - - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - - .set_pud = xen_set_pud_hyper, - - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, - - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, - - .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), - .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), - - .enter_mmap = xen_enter_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = xen_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = xen_flush_lazy_mmu, - }, - - .set_fixmap = xen_set_fixmap, - }, -}; - void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_ops.mmu = xen_mmu_ops.mmu; + pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2); + pv_ops.mmu.write_cr2 = xen_write_cr2; + pv_ops.mmu.read_cr3 = xen_read_cr3; + pv_ops.mmu.write_cr3 = xen_write_cr3_init; + pv_ops.mmu.flush_tlb_user = xen_flush_tlb; + pv_ops.mmu.flush_tlb_kernel = xen_flush_tlb; + pv_ops.mmu.flush_tlb_one_user = xen_flush_tlb_one_user; + pv_ops.mmu.flush_tlb_multi = xen_flush_tlb_multi; + pv_ops.mmu.pgd_alloc = xen_pgd_alloc; + pv_ops.mmu.pgd_free = xen_pgd_free; + pv_ops.mmu.alloc_pte = xen_alloc_pte_init; + pv_ops.mmu.release_pte = xen_release_pte_init; + pv_ops.mmu.alloc_pmd = xen_alloc_pmd_init; + pv_ops.mmu.release_pmd = xen_release_pmd_init; + pv_ops.mmu.set_pte = xen_set_pte_init; + pv_ops.mmu.set_pmd = xen_set_pmd_hyper; + pv_ops.mmu.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_ops.mmu.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + pv_ops.mmu.pte_val = PV_CALLEE_SAVE(xen_pte_val); + pv_ops.mmu.pgd_val = PV_CALLEE_SAVE(xen_pgd_val); + pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte_init); + pv_ops.mmu.make_pgd = PV_CALLEE_SAVE(xen_make_pgd); + pv_ops.mmu.set_pud = xen_set_pud_hyper; + pv_ops.mmu.make_pmd = PV_CALLEE_SAVE(xen_make_pmd); + pv_ops.mmu.pmd_val = PV_CALLEE_SAVE(xen_pmd_val); + pv_ops.mmu.pud_val = PV_CALLEE_SAVE(xen_pud_val); + pv_ops.mmu.make_pud = PV_CALLEE_SAVE(xen_make_pud); + pv_ops.mmu.set_p4d = xen_set_p4d_hyper; + pv_ops.mmu.alloc_pud = xen_alloc_pmd_init; + pv_ops.mmu.release_pud = xen_release_pmd_init; + pv_ops.mmu.p4d_val = PV_CALLEE_SAVE(xen_p4d_val); + pv_ops.mmu.make_p4d = PV_CALLEE_SAVE(xen_make_p4d); + pv_ops.mmu.enter_mmap = xen_enter_mmap; + pv_ops.mmu.exit_mmap = xen_exit_mmap; + pv_ops.mmu.lazy_mode.enter = xen_enter_lazy_mmu; + pv_ops.mmu.lazy_mode.leave = xen_leave_lazy_mmu; + pv_ops.mmu.lazy_mode.flush = xen_flush_lazy_mmu; + pv_ops.mmu.set_fixmap = xen_set_fixmap; memset(dummy_mapping, 0xff, PAGE_SIZE); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8ab88f2b2c1b..61f3e0c48fcc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -570,7 +570,6 @@ static int init_pv_ops(struct objtool_file *file) { static const char *pv_ops_tables[] = { "pv_ops", - "xen_mmu_ops", NULL, }; const char *pv_ops; -- cgit v1.2.3 From 21eb90fb5fbc939eb68262efc0e916293d8299d2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:36 -0800 Subject: tools: ynl: cli: introduce formatting for attr names in --list-attrs It's a little hard to make sense of the output of --list-attrs, it looks like a wall of text. Sprinkle a little bit of formatting - make op and attr names bold, and Enum: / Flags: keywords italics. Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 5fee45e48bbf..aa50d42e35ac 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -20,6 +20,29 @@ from lib import YnlFamily, Netlink, NlError, SpecFamily, SpecException, YnlExcep SYS_SCHEMA_DIR='/usr/share/ynl' RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink' +# pylint: disable=too-few-public-methods,too-many-locals +class Colors: + """ANSI color and font modifier codes""" + RESET = '\033[0m' + + BOLD = '\033[1m' + ITALICS = '\033[3m' + UNDERLINE = '\033[4m' + INVERT = '\033[7m' + + +def color(text, modifiers): + """Add color to text if output is a TTY + + Returns: + Colored text if stdout is a TTY, otherwise plain text + """ + if sys.stdout.isatty(): + # Join the colors if they are a list, if it's a string this a noop + modifiers = "".join(modifiers) + return f"{modifiers}{text}{Colors.RESET}" + return text + def schema_dir(): """ Return the effective schema directory, preferring in-tree before @@ -60,7 +83,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): for attr_name in attr_names: if attr_name in attr_set.attrs: attr = attr_set.attrs[attr_name] - attr_info = f'{prefix}- {attr_name}: {attr.type}' + attr_info = f'{prefix}- {color(attr_name, Colors.BOLD)}: {attr.type}' if 'enum' in attr.yaml: enum_name = attr.yaml['enum'] attr_info += f" (enum: {enum_name})" @@ -68,7 +91,8 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): if enum_name in ynl.consts: const = ynl.consts[enum_name] enum_values = list(const.entries.keys()) - attr_info += f"\n{prefix} {const.type.capitalize()}: {', '.join(enum_values)}" + type_fmted = color(const.type.capitalize(), Colors.ITALICS) + attr_info += f"\n{prefix} {type_fmted}: {', '.join(enum_values)}" # Show nested attributes reference and recursively display them nested_set_name = None @@ -226,7 +250,7 @@ def main(): print(f'Operation {args.list_attrs} not found') sys.exit(1) - print(f'Operation: {op.name}') + print(f'Operation: {color(op.name, Colors.BOLD)}') print(op.yaml['doc']) for mode in ['do', 'dump', 'event']: -- cgit v1.2.3 From 101a7d57d518c0c9e9eefc3768909ae02b96b3ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:37 -0800 Subject: tools: ynl: cli: wrap the doc text if it's long We already use textwrap when printing "doc" section about an attribute, but only to indent the text. Switch to using fill() to split and indent all the lines. While at it indent the text by 2 more spaces, so that it doesn't align with the name of the attribute. Before (I'm drawing a "box" at ~60 cols here, in an attempt for clarity): | - irq-suspend-timeout: uint | | The timeout, in nanoseconds, of how long to suspend irq| |processing, if event polling finds events | After: | - irq-suspend-timeout: uint | | The timeout, in nanoseconds, of how long to suspend | | irq processing, if event polling finds events | Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index aa50d42e35ac..dc84619e5518 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -10,6 +10,7 @@ import json import os import pathlib import pprint +import shutil import sys import textwrap @@ -101,7 +102,11 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): attr_info += f" -> {nested_set_name}" if attr.yaml.get('doc'): - doc_text = textwrap.indent(attr.yaml['doc'], prefix + ' ') + doc_prefix = prefix + ' ' * 4 + term_width = shutil.get_terminal_size().columns + doc_text = textwrap.fill(attr.yaml['doc'], width=term_width, + initial_indent=doc_prefix, + subsequent_indent=doc_prefix) attr_info += f"\n{doc_text}" print(attr_info) -- cgit v1.2.3 From 1b7fbf62ad8b404e55d195021724067b5122b630 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:38 -0800 Subject: tools: ynl: cli: improve --help Improve the clarity of --help. Reorder, provide some grouping and add help messages to most of the options. No functional changes intended. Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 113 +++++++++++++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index dc84619e5518..3aa1f1e816bf 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -151,47 +151,78 @@ def main(): """ parser = argparse.ArgumentParser(description=description, - epilog=epilog) - spec_group = parser.add_mutually_exclusive_group(required=True) - spec_group.add_argument('--family', dest='family', type=str, - help='name of the netlink FAMILY') - spec_group.add_argument('--list-families', action='store_true', - help='list all netlink families supported by YNL (has spec)') - spec_group.add_argument('--spec', dest='spec', type=str, - help='choose the family by SPEC file path') - - parser.add_argument('--schema', dest='schema', type=str) - parser.add_argument('--no-schema', action='store_true') - parser.add_argument('--json', dest='json_text', type=str) - - group = parser.add_mutually_exclusive_group() - group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) - group.add_argument('--multi', dest='multi', nargs=2, action='append', - metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) - group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) - group.add_argument('--list-ops', action='store_true') - group.add_argument('--list-msgs', action='store_true') - group.add_argument('--list-attrs', dest='list_attrs', metavar='OPERATION', type=str, - help='List attributes for an operation') - group.add_argument('--validate', action='store_true') - - parser.add_argument('--duration', dest='duration', type=int, - help='when subscribed, watch for DURATION seconds') - parser.add_argument('--sleep', dest='duration', type=int, - help='alias for duration') - parser.add_argument('--subscribe', dest='ntf', type=str) - parser.add_argument('--replace', dest='flags', action='append_const', - const=Netlink.NLM_F_REPLACE) - parser.add_argument('--excl', dest='flags', action='append_const', - const=Netlink.NLM_F_EXCL) - parser.add_argument('--create', dest='flags', action='append_const', - const=Netlink.NLM_F_CREATE) - parser.add_argument('--append', dest='flags', action='append_const', - const=Netlink.NLM_F_APPEND) - parser.add_argument('--process-unknown', action=argparse.BooleanOptionalAction) - parser.add_argument('--output-json', action='store_true') - parser.add_argument('--dbg-small-recv', default=0, const=4000, - action='store', nargs='?', type=int) + epilog=epilog, add_help=False) + + gen_group = parser.add_argument_group('General options') + gen_group.add_argument('-h', '--help', action='help', + help='show this help message and exit') + + spec_group = parser.add_argument_group('Netlink family selection') + spec_sel = spec_group.add_mutually_exclusive_group(required=True) + spec_sel.add_argument('--list-families', action='store_true', + help=('list Netlink families supported by YNL ' + '(which have a spec available in the standard ' + 'system path)')) + spec_sel.add_argument('--family', dest='family', type=str, + help='name of the Netlink FAMILY to use') + spec_sel.add_argument('--spec', dest='spec', type=str, + help='full file path to the YAML spec file') + + ops_group = parser.add_argument_group('Operations') + ops = ops_group.add_mutually_exclusive_group() + ops.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + ops.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + ops.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str, + help="Multi-message operation sequence (for nftables)") + ops.add_argument('--list-ops', action='store_true', + help="List available --do and --dump operations") + ops.add_argument('--list-msgs', action='store_true', + help="List all messages of the family (incl. notifications)") + ops.add_argument('--list-attrs', dest='list_attrs', metavar='MSG', + type=str, help='List attributes for a message / operation') + ops.add_argument('--validate', action='store_true', + help="Validate the spec against schema and exit") + + io_group = parser.add_argument_group('Input / Output') + io_group.add_argument('--json', dest='json_text', type=str, + help=('Specify attributes of the message to send ' + 'to the kernel in JSON format. Can be left out ' + 'if the message is expected to be empty.')) + io_group.add_argument('--output-json', action='store_true', + help='Format output as JSON') + + ntf_group = parser.add_argument_group('Notifications') + ntf_group.add_argument('--subscribe', dest='ntf', type=str) + ntf_group.add_argument('--duration', dest='duration', type=int, + help='when subscribed, watch for DURATION seconds') + ntf_group.add_argument('--sleep', dest='duration', type=int, + help='alias for duration') + + nlflags = parser.add_argument_group('Netlink message flags (NLM_F_*)', + ('Extra flags to set in nlmsg_flags of ' + 'the request, used mostly by older ' + 'Classic Netlink families.')) + nlflags.add_argument('--replace', dest='flags', action='append_const', + const=Netlink.NLM_F_REPLACE) + nlflags.add_argument('--excl', dest='flags', action='append_const', + const=Netlink.NLM_F_EXCL) + nlflags.add_argument('--create', dest='flags', action='append_const', + const=Netlink.NLM_F_CREATE) + nlflags.add_argument('--append', dest='flags', action='append_const', + const=Netlink.NLM_F_APPEND) + + schema_group = parser.add_argument_group('Development options') + schema_group.add_argument('--schema', dest='schema', type=str, + help="JSON schema to validate the spec") + schema_group.add_argument('--no-schema', action='store_true') + + dbg_group = parser.add_argument_group('Debug options') + dbg_group.add_argument('--dbg-small-recv', default=0, const=4000, + action='store', nargs='?', type=int, metavar='INT', + help="Length of buffers used for recv()") + dbg_group.add_argument('--process-unknown', action=argparse.BooleanOptionalAction) + args = parser.parse_args() def output(msg): -- cgit v1.2.3 From aca1fe235c10f7d06e9ebab4534852f109e6a8e9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:39 -0800 Subject: tools: ynl: cli: add --doc as alias to --list-attrs --list-attrs also provides information about the operation itself. So --doc seems more appropriate. Add an alias. Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 3aa1f1e816bf..4147c498b479 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -179,7 +179,7 @@ def main(): help="List available --do and --dump operations") ops.add_argument('--list-msgs', action='store_true', help="List all messages of the family (incl. notifications)") - ops.add_argument('--list-attrs', dest='list_attrs', metavar='MSG', + ops.add_argument('--list-attrs', '--doc', dest='list_attrs', metavar='MSG', type=str, help='List attributes for a message / operation') ops.add_argument('--validate', action='store_true', help="Validate the spec against schema and exit") -- cgit v1.2.3 From 45b99bb464eb62da555ecbef31583d9701881d43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:40 -0800 Subject: tools: ynl: cli: factor out --list-attrs / --doc handling We'll soon add more code to the --doc handling. Factor it out to avoid making main() too long. Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 4147c498b479..6975efa7874f 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -137,6 +137,25 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): print_attr_list(ynl, mode_spec['attributes'], attr_set) +def do_doc(ynl, op): + """Handle --list-attrs $op, print the attr information to stdout""" + print(f'Operation: {color(op.name, Colors.BOLD)}') + print(op.yaml['doc']) + + for mode in ['do', 'dump', 'event']: + if mode in op.yaml: + print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True) + + if 'notify' in op.yaml: + mode_spec = op.yaml['notify'] + ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') + if ref_spec: + print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False) + + if 'mcgrp' in op.yaml: + print(f"\nMulticast group: {op.yaml['mcgrp']}") + + # pylint: disable=too-many-locals,too-many-branches,too-many-statements def main(): """YNL cli tool""" @@ -286,21 +305,7 @@ def main(): print(f'Operation {args.list_attrs} not found') sys.exit(1) - print(f'Operation: {color(op.name, Colors.BOLD)}') - print(op.yaml['doc']) - - for mode in ['do', 'dump', 'event']: - if mode in op.yaml: - print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True) - - if 'notify' in op.yaml: - mode_spec = op.yaml['notify'] - ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') - if ref_spec: - print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False) - - if 'mcgrp' in op.yaml: - print(f"\nMulticast group: {op.yaml['mcgrp']}") + do_doc(ynl, op) try: if args.do: -- cgit v1.2.3 From 6ccc421b14613aac32b2647462ae4d40f5dd43b8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:41 -0800 Subject: tools: ynl: cli: extract the event/notify handling in --list-attrs Event and notify handling is quite different from do / dump handling. Forcing it into print_mode_attrs() doesn't really buy us anything as events and notifications do not have requests. Call print_attr_list() directly. Apart form subjective code clarity this also removes the word "reply" from the output: Before: Event reply attributes: Now: Event attributes: Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 6975efa7874f..6d2f0412dde0 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -120,11 +120,11 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): print_attr_list(ynl, nested_names, nested_set, indent + 4) -def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): +def print_mode_attrs(ynl, mode, mode_spec, attr_set): """Print a given mode (do/dump/event/notify).""" mode_title = mode.capitalize() - if print_request and 'request' in mode_spec and 'attributes' in mode_spec['request']: + if 'request' in mode_spec and 'attributes' in mode_spec['request']: print(f'\n{mode_title} request attributes:') print_attr_list(ynl, mode_spec['request']['attributes'], attr_set) @@ -132,25 +132,28 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): print(f'\n{mode_title} reply attributes:') print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) - if 'attributes' in mode_spec: - print(f'\n{mode_title} attributes:') - print_attr_list(ynl, mode_spec['attributes'], attr_set) - def do_doc(ynl, op): """Handle --list-attrs $op, print the attr information to stdout""" print(f'Operation: {color(op.name, Colors.BOLD)}') print(op.yaml['doc']) - for mode in ['do', 'dump', 'event']: + for mode in ['do', 'dump']: if mode in op.yaml: - print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True) + print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set) + + if 'attributes' in op.yaml.get('event', {}): + print('\nEvent attributes:') + print_attr_list(ynl, op.yaml['event']['attributes'], op.attr_set) if 'notify' in op.yaml: mode_spec = op.yaml['notify'] ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') + if not ref_spec: + ref_spec = ynl.msgs.get(mode_spec).yaml.get('dump') if ref_spec: - print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False) + print('\nNotification attributes:') + print_attr_list(ynl, ref_spec['reply']['attributes'], op.attr_set) if 'mcgrp' in op.yaml: print(f"\nMulticast group: {op.yaml['mcgrp']}") -- cgit v1.2.3 From 60411adedf70abec0ac221ec3d88f6453b031dd2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 10 Jan 2026 15:31:42 -0800 Subject: tools: ynl: cli: print reply in combined format if possible As pointed out during review of the --list-attrs support the GET ops very often return the same attrs from do and dump. Make the output more readable by combining the reply information, from: Do request attributes: - ifindex: u32 netdev ifindex Do reply attributes: - ifindex: u32 netdev ifindex [ .. other attrs .. ] Dump reply attributes: - ifindex: u32 netdev ifindex [ .. other attrs .. ] To, after: Do request attributes: - ifindex: u32 netdev ifindex Do and Dump reply attributes: - ifindex: u32 netdev ifindex [ .. other attrs .. ] Tested-by: Gal Pressman Acked-by: Stanislav Fomichev Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20260110233142.3921386-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/cli.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 6d2f0412dde0..fdac1ab10a40 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -120,7 +120,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): print_attr_list(ynl, nested_names, nested_set, indent + 4) -def print_mode_attrs(ynl, mode, mode_spec, attr_set): +def print_mode_attrs(ynl, mode, mode_spec, attr_set, consistent_dd_reply=None): """Print a given mode (do/dump/event/notify).""" mode_title = mode.capitalize() @@ -129,8 +129,15 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set): print_attr_list(ynl, mode_spec['request']['attributes'], attr_set) if 'reply' in mode_spec and 'attributes' in mode_spec['reply']: - print(f'\n{mode_title} reply attributes:') - print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) + if consistent_dd_reply and mode == "do": + title = None # Dump handling will print in combined format + elif consistent_dd_reply and mode == "dump": + title = 'Do and Dump' + else: + title = f'{mode_title}' + if title: + print(f'\n{title} reply attributes:') + print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) def do_doc(ynl, op): @@ -138,9 +145,15 @@ def do_doc(ynl, op): print(f'Operation: {color(op.name, Colors.BOLD)}') print(op.yaml['doc']) + consistent_dd_reply = False + if 'do' in op.yaml and 'dump' in op.yaml and 'reply' in op.yaml['do'] and \ + op.yaml['do']['reply'] == op.yaml['dump'].get('reply'): + consistent_dd_reply = True + for mode in ['do', 'dump']: if mode in op.yaml: - print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set) + print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, + consistent_dd_reply=consistent_dd_reply) if 'attributes' in op.yaml.get('event', {}): print('\nEvent attributes:') -- cgit v1.2.3 From 65955a0993a0a9536263fea2eaae8aed496dcc9c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 00:05:02 +0200 Subject: selftests: ublk: add stop command with --safe option Add 'stop' subcommand to kublk utility that uses the new UBLK_CMD_TRY_STOP_DEV command when --safe option is specified. This allows stopping a device only if it has no active openers, returning -EBUSY otherwise. Also add test_generic_16.sh to test the new functionality. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/kublk.c | 53 +++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 1 + tools/testing/selftests/ublk/test_generic_16.sh | 57 +++++++++++++++++++++++++ 4 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_generic_16.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 036a9f01b464..3a2498089b15 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -23,6 +23,7 @@ TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index d95937dd6167..3472ce7426ba 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -108,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev) return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + static int ublk_ctrl_start_dev(struct ublk_dev *dev, int daemon_pid) { @@ -1424,6 +1433,42 @@ static int cmd_dev_del(struct dev_ctx *ctx) return 0; } +static int cmd_dev_stop(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + if (number < 0) { + ublk_err("%s: device id is required\n", __func__); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + if (ctx->safe_stop) { + ret = ublk_ctrl_try_stop_dev(dev); + if (ret < 0) + ublk_err("%s: try_stop dev %d failed ret %d\n", + __func__, number, ret); + } else { + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", + __func__, number, ret); + } + +fail: + ublk_ctrl_deinit(dev); + + return ret; +} + static int __cmd_dev_list(struct dev_ctx *ctx) { struct ublk_dev *dev = ublk_ctrl_init(); @@ -1487,6 +1532,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), + FEAT_NAME(UBLK_F_SAFE_STOP_DEV) }; struct ublk_dev *dev; __u64 features = 0; @@ -1616,6 +1662,8 @@ static int cmd_dev_help(char *exe) printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n\n"); + printf("%s stop -n dev_id [--safe]\n", exe); + printf("\t --safe only stop if device has no active openers\n\n"); printf("%s list [-n dev_id] -a \n", exe); printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); @@ -1653,6 +1701,7 @@ int main(int argc, char *argv[]) { "pi_offset", 1, NULL, 0 }, { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, + { "safe", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1760,6 +1809,8 @@ int main(int argc, char *argv[]) } if (!strcmp(longopts[option_idx].name, "tag_size")) ctx.tag_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "safe")) + ctx.safe_stop = 1; break; case '?': /* @@ -1842,6 +1893,8 @@ int main(int argc, char *argv[]) } } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "stop")) + ret = cmd_dev_stop(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; ret = cmd_dev_list(&ctx); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 96c66b337bc0..cb757fd9bf9d 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -83,6 +83,7 @@ struct dev_ctx { __u8 pi_offset; __u8 csum_type; __u8 tag_size; + unsigned int safe_stop:1; int _evtfd; int _shmid; diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 000000000000..e08af7b685c9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_16" +ERR_CODE=0 + +_prep_test "null" "stop --safe command" + +# Check if SAFE_STOP_DEV feature is supported +if ! _have_feature "SAFE_STOP_DEV"; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# Test 1: stop --safe on idle device should succeed +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Device is idle (no openers), stop --safe should succeed +if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then + echo "stop --safe on idle device failed unexpectedly!" + ERR_CODE=255 +fi + +# Clean up device +${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +udevadm settle + +# Test 2: stop --safe on device with active opener should fail +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Open device in background (dd reads indefinitely) +dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 & +dd_pid=$! + +# Give dd time to start +sleep 0.2 + +# Device has active opener, stop --safe should fail with -EBUSY +if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then + echo "stop --safe on busy device succeeded unexpectedly!" + ERR_CODE=255 +fi + +# Kill dd and clean up +kill $dd_pid 2>/dev/null +wait $dd_pid 2>/dev/null + +# Now device should be idle, regular delete should work +${UBLK_PROG} del -n "${dev_id}" +udevadm settle + +_cleanup_test "null" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From ba7f1024a1024f219a18c40b4ab2d7d900fd2d15 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:52 +0000 Subject: selftests/bpf: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. As bpf_testmod_ctx_release() signature differs from the btf_dtor_kfunc_t pointer type used for the destructor calls in bpf_obj_free_fields(), add a stub function with the correct type to fix the type mismatch. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-9-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1c41d03bd5a1..bc07ce9d5477 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -285,6 +285,12 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) call_rcu(&ctx->rcu, testmod_free_cb); } +__bpf_kfunc void bpf_testmod_ctx_release_dtor(void *ctx) +{ + bpf_testmod_ctx_release(ctx); +} +CFI_NOSEAL(bpf_testmod_ctx_release_dtor); + static struct bpf_testmod_ops3 *st_ops3; static int bpf_testmod_test_3(void) @@ -707,7 +713,7 @@ BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) BTF_ID(struct, bpf_testmod_ctx) -BTF_ID(func, bpf_testmod_ctx_release) +BTF_ID(func, bpf_testmod_ctx_release_dtor) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, -- cgit v1.2.3 From 088f35ab9fd4a03b8c6ccdda7b92461d92bf7b8b Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Fri, 9 Jan 2026 20:52:01 +0530 Subject: selftests/net/ipsec: Fix variable size type not at the end of struct The "struct alg" object contains a union of 3 xfrm structures: union { struct xfrm_algo; struct xfrm_algo_aead; struct xfrm_algo_auth; } All of them end with a flexible array member used to store key material, but the flexible array appears at *different offsets* in each struct. bcz of this, union itself is of variable-sized & Placing it above char buf[...] triggers: ipsec.c:835:5: warning: field 'u' with variable sized type 'union (unnamed union at ipsec.c:831:3)' not at the end of a struct or class is a GNU extension [-Wgnu-variable-sized-type-not-at-end] 835 | } u; | ^ one fix is to use "TRAILING_OVERLAP()" which works with one flexible array member only. But In "struct alg" flexible array member exists in all union members, but not at the same offset, so TRAILING_OVERLAP cannot be applied. so the fix is to explicitly overlay the key buffer at the correct offset for the largest union member (xfrm_algo_auth). This ensures that the flexible-array region and the fixed buffer line up. No functional change. Reviewed-by: Simon Horman Signed-off-by: Ankit Khushwaha Link: https://patch.msgid.link/20260109152201.15668-1-ankitkhushwaha.linux@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ipsec.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 0ccf484b1d9d..f4afef51b930 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -43,6 +43,10 @@ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#ifndef offsetof +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + #define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */ #define MAX_PAYLOAD 2048 #define XFRM_ALGO_KEY_BUF_SIZE 512 @@ -827,13 +831,16 @@ static int xfrm_fill_key(char *name, char *buf, static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz, struct xfrm_desc *desc) { - struct { + union { union { struct xfrm_algo alg; struct xfrm_algo_aead aead; struct xfrm_algo_auth auth; } u; - char buf[XFRM_ALGO_KEY_BUF_SIZE]; + struct { + unsigned char __offset_to_FAM[offsetof(struct xfrm_algo_auth, alg_key)]; + char buf[XFRM_ALGO_KEY_BUF_SIZE]; + }; } alg = {}; size_t alen, elen, clen, aelen; unsigned short type; -- cgit v1.2.3 From 6ea8a206108fe8b5940c2797afc54ae9f5a7bbdd Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Mon, 12 Jan 2026 21:26:41 +0200 Subject: rtla: Fix parse_cpu_set() bug introduced by strtoi() The patch 'Replace atoi() with a robust strtoi()' introduced a bug in parse_cpu_set(), which relies on partial parsing of the input string. The function parses CPU specifications like '0-3,5' by incrementing a pointer through the string. strtoi() rejects strings with trailing characters, causing parse_cpu_set() to fail on any CPU list with multiple entries. Restore the original use of atoi() in parse_cpu_set(). Fixes: 7e9dfccf8f11 ("rtla: Replace atoi() with a robust strtoi()") Signed-off-by: Costa Shulyupin Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20260112192642.212848-2-costa.shul@redhat.com Signed-off-by: Tomas Glozar --- tools/tracing/rtla/src/utils.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 18986a5aed3c..0da3b2470c31 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -128,18 +128,16 @@ int parse_cpu_set(char *cpu_list, cpu_set_t *set) nr_cpus = sysconf(_SC_NPROCESSORS_CONF); for (p = cpu_list; *p; ) { - if (strtoi(p, &cpu)) - goto err; - if (cpu < 0 || cpu >= nr_cpus) + cpu = atoi(p); + if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus) goto err; while (isdigit(*p)) p++; if (*p == '-') { p++; - if (strtoi(p, &end_cpu)) - goto err; - if (end_cpu < cpu || end_cpu >= nr_cpus) + end_cpu = atoi(p); + if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus) goto err; while (isdigit(*p)) p++; -- cgit v1.2.3 From 602544773763da411ffa67567fa1d146f3a40231 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jan 2026 16:31:09 -0800 Subject: uapi: promote EFSCORRUPTED and EUCLEAN to errno.h Stop definining these privately and instead move them to the uapi errno.h so that they become canonical instead of copy pasta. Cc: linux-api@vger.kernel.org Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/176826402587.3490369.17659117524205214600.stgit@frogsfrogsfrogs Reviewed-by: Gao Xiang Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- arch/alpha/include/uapi/asm/errno.h | 2 ++ arch/mips/include/uapi/asm/errno.h | 2 ++ arch/parisc/include/uapi/asm/errno.h | 2 ++ arch/sparc/include/uapi/asm/errno.h | 2 ++ fs/erofs/internal.h | 2 -- fs/ext2/ext2.h | 1 - fs/ext4/ext4.h | 3 --- fs/f2fs/f2fs.h | 3 --- fs/minix/minix.h | 2 -- fs/udf/udf_sb.h | 2 -- fs/xfs/xfs_linux.h | 2 -- include/linux/jbd2.h | 3 --- include/uapi/asm-generic/errno.h | 2 ++ tools/arch/alpha/include/uapi/asm/errno.h | 2 ++ tools/arch/mips/include/uapi/asm/errno.h | 2 ++ tools/arch/parisc/include/uapi/asm/errno.h | 2 ++ tools/arch/sparc/include/uapi/asm/errno.h | 2 ++ tools/include/uapi/asm-generic/errno.h | 2 ++ 18 files changed, 20 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h index 3d265f6babaf..6791f6508632 100644 --- a/arch/alpha/include/uapi/asm/errno.h +++ b/arch/alpha/include/uapi/asm/errno.h @@ -55,6 +55,7 @@ #define ENOSR 82 /* Out of streams resources */ #define ETIME 83 /* Timer expired */ #define EBADMSG 84 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EPROTO 85 /* Protocol error */ #define ENODATA 86 /* No data available */ #define ENOSTR 87 /* Device not a stream */ @@ -96,6 +97,7 @@ #define EREMCHG 115 /* Remote address changed */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h index 2fb714e2d6d8..c01ed91b1ef4 100644 --- a/arch/mips/include/uapi/asm/errno.h +++ b/arch/mips/include/uapi/asm/errno.h @@ -50,6 +50,7 @@ #define EDOTDOT 73 /* RFS specific error */ #define EMULTIHOP 74 /* Multihop attempted */ #define EBADMSG 77 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define ENAMETOOLONG 78 /* File name too long */ #define EOVERFLOW 79 /* Value too large for defined data type */ #define ENOTUNIQ 80 /* Name not unique on network */ @@ -88,6 +89,7 @@ #define EISCONN 133 /* Transport endpoint is already connected */ #define ENOTCONN 134 /* Transport endpoint is not connected */ #define EUCLEAN 135 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 137 /* Not a XENIX named type file */ #define ENAVAIL 138 /* No XENIX semaphores available */ #define EISNAM 139 /* Is a named type file */ diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index 8d94739d75c6..8cbc07c1903e 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -36,6 +36,7 @@ #define EDOTDOT 66 /* RFS specific error */ #define EBADMSG 67 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ #define ESTALE 70 /* Stale file handle */ @@ -62,6 +63,7 @@ #define ERESTART 175 /* Interrupted system call should be restarted */ #define ESTRPIPE 176 /* Streams pipe error */ #define EUCLEAN 177 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 178 /* Not a XENIX named type file */ #define ENAVAIL 179 /* No XENIX semaphores available */ #define EISNAM 180 /* Is a named type file */ diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h index 81a732b902ee..4a41e7835fd5 100644 --- a/arch/sparc/include/uapi/asm/errno.h +++ b/arch/sparc/include/uapi/asm/errno.h @@ -48,6 +48,7 @@ #define ENOSR 74 /* Out of streams resources */ #define ENOMSG 75 /* No message of desired type */ #define EBADMSG 76 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EIDRM 77 /* Identifier removed */ #define EDEADLK 78 /* Resource deadlock would occur */ #define ENOLCK 79 /* No record locks available */ @@ -91,6 +92,7 @@ #define ENOTUNIQ 115 /* Name not unique on network */ #define ERESTART 116 /* Interrupted syscall should be restarted */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f7f622836198..d06e99baf5d5 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -541,6 +541,4 @@ long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long erofs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cf97b76e9fd3..5e0c6c5fcb6c 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -357,7 +357,6 @@ struct ext2_inode { */ #define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT2_ERROR_FS 0x0002 /* Errors detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ /* * Mount flags diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 56112f201cac..62c091b52bac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3938,7 +3938,4 @@ extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, get_block_t *get_block); #endif /* __KERNEL__ */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _EXT4_H */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20edbb99b814..9f3aa3c7f126 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -5004,7 +5004,4 @@ static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); } -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _LINUX_F2FS_H */ diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 2bfaf377f208..7e1f652f16d3 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -175,6 +175,4 @@ static inline int minix_test_bit(int nr, const void *vaddr) __minix_error_inode((inode), __func__, __LINE__, \ (fmt), ##__VA_ARGS__) -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* FS_MINIX_H */ diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 08ec8756b948..8399accc788d 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -55,8 +55,6 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 -#define EFSCORRUPTED EUCLEAN - struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 4dd747bdbcca..55064228c4d5 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -121,8 +121,6 @@ typedef __u32 xfs_nlink_t; #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define __return_address __builtin_return_address(0) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f5eaf76198f3..a53a00d36228 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1815,7 +1815,4 @@ static inline int jbd2_handle_buffer_credits(handle_t *handle) #endif /* __KERNEL__ */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _LINUX_JBD2_H */ diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h index 3d265f6babaf..6791f6508632 100644 --- a/tools/arch/alpha/include/uapi/asm/errno.h +++ b/tools/arch/alpha/include/uapi/asm/errno.h @@ -55,6 +55,7 @@ #define ENOSR 82 /* Out of streams resources */ #define ETIME 83 /* Timer expired */ #define EBADMSG 84 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EPROTO 85 /* Protocol error */ #define ENODATA 86 /* No data available */ #define ENOSTR 87 /* Device not a stream */ @@ -96,6 +97,7 @@ #define EREMCHG 115 /* Remote address changed */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h index 2fb714e2d6d8..c01ed91b1ef4 100644 --- a/tools/arch/mips/include/uapi/asm/errno.h +++ b/tools/arch/mips/include/uapi/asm/errno.h @@ -50,6 +50,7 @@ #define EDOTDOT 73 /* RFS specific error */ #define EMULTIHOP 74 /* Multihop attempted */ #define EBADMSG 77 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define ENAMETOOLONG 78 /* File name too long */ #define EOVERFLOW 79 /* Value too large for defined data type */ #define ENOTUNIQ 80 /* Name not unique on network */ @@ -88,6 +89,7 @@ #define EISCONN 133 /* Transport endpoint is already connected */ #define ENOTCONN 134 /* Transport endpoint is not connected */ #define EUCLEAN 135 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 137 /* Not a XENIX named type file */ #define ENAVAIL 138 /* No XENIX semaphores available */ #define EISNAM 139 /* Is a named type file */ diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h index 8d94739d75c6..8cbc07c1903e 100644 --- a/tools/arch/parisc/include/uapi/asm/errno.h +++ b/tools/arch/parisc/include/uapi/asm/errno.h @@ -36,6 +36,7 @@ #define EDOTDOT 66 /* RFS specific error */ #define EBADMSG 67 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ #define ESTALE 70 /* Stale file handle */ @@ -62,6 +63,7 @@ #define ERESTART 175 /* Interrupted system call should be restarted */ #define ESTRPIPE 176 /* Streams pipe error */ #define EUCLEAN 177 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 178 /* Not a XENIX named type file */ #define ENAVAIL 179 /* No XENIX semaphores available */ #define EISNAM 180 /* Is a named type file */ diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h index 81a732b902ee..4a41e7835fd5 100644 --- a/tools/arch/sparc/include/uapi/asm/errno.h +++ b/tools/arch/sparc/include/uapi/asm/errno.h @@ -48,6 +48,7 @@ #define ENOSR 74 /* Out of streams resources */ #define ENOMSG 75 /* No message of desired type */ #define EBADMSG 76 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EIDRM 77 /* Identifier removed */ #define EDEADLK 78 /* Resource deadlock would occur */ #define ENOLCK 79 /* No record locks available */ @@ -91,6 +92,7 @@ #define ENOTUNIQ 115 /* Name not unique on network */ #define ERESTART 116 /* Interrupted syscall should be restarted */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/tools/include/uapi/asm-generic/errno.h +++ b/tools/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ -- cgit v1.2.3 From 26bea10450afe5ad4dd0e0bbb797c44e1df110fe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 6 Jan 2026 12:13:15 +0100 Subject: objtool: fix compilation failure with the x32 toolchain When using the x32 toolchain, compilation fails because the printf specifier "%lx" (long), doesn't match the type of the "checksum" variable (long long). Fix this by changing the printf specifier to "%llx" and casting "checksum" to unsigned long long. Fixes: a3493b33384a ("objtool/klp: Add --debug-checksum= to show per-instruction checksums") Signed-off-by: Mikulas Patocka Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/a1158c99-fe0e-a218-4b5b-ffac212489f6@redhat.com --- tools/objtool/include/objtool/warn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index 25ff7942b4d5..2b27b54096b8 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -152,8 +152,8 @@ static inline void unindent(int *unused) { indent--; } if (unlikely(insn->sym && insn->sym->pfunc && \ insn->sym->pfunc->debug_checksum)) { \ char *insn_off = offstr(insn->sec, insn->offset); \ - __dbg("checksum: %s %s %016lx", \ - func->name, insn_off, checksum); \ + __dbg("checksum: %s %s %016llx", \ + func->name, insn_off, (unsigned long long)checksum);\ free(insn_off); \ } \ }) -- cgit v1.2.3 From 436326bc525d467e38db1da576139ec5f28268c5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 23 Dec 2025 07:03:57 -0500 Subject: objtool: fix build failure due to missing libopcodes check Commit 59953303827e ("objtool: Disassemble code with libopcodes instead of running objdump") added support for using libopcodes for disassembly. However, the feature detection checks for libbfd availability but then unconditionally links against libopcodes: ifeq ($(feature-libbfd),1) OBJTOOL_LDFLAGS += -lopcodes endif This causes build failures in environments where libbfd is installed but libopcodes is not, since the test-libbfd.c feature test only links against -lbfd and -ldl, not -lopcodes: /usr/bin/ld: cannot find -lopcodes: No such file or directory collect2: error: ld returned 1 exit status make[4]: *** [Makefile:109: objtool] Error 1 Additionally, the shared feature framework uses $(CC) which is the cross-compiler in cross-compilation builds. Since objtool is a host tool that links with $(HOSTCC) against host libraries, the feature detection can falsely report libopcodes as available when the cross-compiler's sysroot has it but the host system doesn't. Fix this by replacing the feature framework check with a direct inline test that uses $(HOSTCC) to compile and link a test program against libopcodes, similar to how xxhash availability is detected. Fixes: 59953303827e ("objtool: Disassemble code with libopcodes instead of running objdump") Assisted-by: claude-opus-4-5-20251101 Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251223120357.2492008-1-sashal@kernel.org --- tools/objtool/Makefile | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index ad6e1ec706ce..9b4503113ce5 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -72,23 +72,27 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" # # To support disassembly, objtool needs libopcodes which is provided -# with libbdf (binutils-dev or binutils-devel package). +# with libbfd (binutils-dev or binutils-devel package). # -FEATURE_USER = .objtool -FEATURE_TESTS = libbfd disassembler-init-styled -FEATURE_DISPLAY = -include $(srctree)/tools/build/Makefile.feature +# We check using HOSTCC directly rather than the shared feature framework +# because objtool is a host tool that links against host libraries. +# +HAVE_LIBOPCODES := $(shell echo 'int main(void) { return 0; }' | \ + $(HOSTCC) -xc - -o /dev/null -lopcodes 2>/dev/null && echo y) -ifeq ($(feature-disassembler-init-styled), 1) - OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED -endif +# Styled disassembler support requires binutils >= 2.39 +HAVE_DISASM_STYLED := $(shell echo '$(pound)include ' | \ + $(HOSTCC) -E -xc - 2>/dev/null | grep -q disassembler_style && echo y) BUILD_DISAS := n -ifeq ($(feature-libbfd),1) +ifeq ($(HAVE_LIBOPCODES),y) BUILD_DISAS := y - OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool" + OBJTOOL_CFLAGS += -DDISAS -DPACKAGE='"objtool"' OBJTOOL_LDFLAGS += -lopcodes +ifeq ($(HAVE_DISASM_STYLED),y) + OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED +endif endif export BUILD_DISAS -- cgit v1.2.3 From 8d61f1a9f2541c6ef51d4997e6a4c5a1c0d8b27c Mon Sep 17 00:00:00 2001 From: Jonas Köppeler Date: Fri, 9 Jan 2026 14:15:35 +0100 Subject: selftests/tc-testing: add selftests for cake_mq qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test 684b: Create CAKE_MQ with default setting (4 queues) Test 7ee8: Create CAKE_MQ with bandwidth limit (4 queues) Test 1f87: Create CAKE_MQ with rtt time (4 queues) Test e9cf: Create CAKE_MQ with besteffort flag (4 queues) Test 7c05: Create CAKE_MQ with diffserv8 flag (4 queues) Test 5a77: Create CAKE_MQ with diffserv4 flag (4 queues) Test 8f7a: Create CAKE_MQ with flowblind flag (4 queues) Test 7ef7: Create CAKE_MQ with dsthost and nat flag (4 queues) Test 2e4d: Create CAKE_MQ with wash flag (4 queues) Test b3e6: Create CAKE_MQ with flowblind and no-split-gso flag (4 queues) Test 62cd: Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues) Test 0df3: Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues) Test 9a75: Create CAKE_MQ with memlimit and ptm flag (4 queues) Test cdef: Create CAKE_MQ with fwmark and atm flag (4 queues) Test 93dd: Create CAKE_MQ with overhead 0 and mpu (4 queues) Test 1475: Create CAKE_MQ with conservative and ingress flag (4 queues) Test 7bf1: Delete CAKE_MQ with conservative and ingress flag (4 queues) Test ee55: Replace CAKE_MQ with mpu (4 queues) Test 6df9: Change CAKE_MQ with mpu (4 queues) Test 67e2: Show CAKE_MQ class (4 queues) Test 2de4: Change bandwidth of CAKE_MQ (4 queues) Test 5f62: Fail to create CAKE_MQ with autorate-ingress flag (4 queues) Test 038e: Fail to change setting of sub-qdisc under CAKE_MQ Test 7bdc: Fail to replace sub-qdisc under CAKE_MQ Test 18e0: Fail to install CAKE_MQ on single queue device Reviewed-by: Victor Nogueira Signed-off-by: Jonas Köppeler Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260109-mq-cake-sub-qdisc-v8-6-8d613fece5d8@redhat.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/qdiscs/cake_mq.json | 559 +++++++++++++++++++++ 1 file changed, 559 insertions(+) create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json new file mode 100644 index 000000000000..0efe229fb86e --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json @@ -0,0 +1,559 @@ +[ + { + "id": "684b", + "name": "Create CAKE_MQ with default setting (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1\" > /sys/bus/netdevsim/del_device || true", + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ee8", + "name": "Create CAKE_MQ with bandwidth limit (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1f87", + "name": "Create CAKE_MQ with rtt time (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq rtt 200", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 200us raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "e9cf", + "name": "Create CAKE_MQ with besteffort flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq besteffort", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited besteffort triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7c05", + "name": "Create CAKE_MQ with diffserv8 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv8", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv8 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5a77", + "name": "Create CAKE_MQ with diffserv4 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv4", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv4 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "8f7a", + "name": "Create CAKE_MQ with flowblind flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ef7", + "name": "Create CAKE_MQ with dsthost and nat flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dsthost nat", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dsthost nat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2e4d", + "name": "Create CAKE_MQ with wash flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq hosts wash", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 hosts nonat wash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "b3e6", + "name": "Create CAKE_MQ with flowblind and no-split-gso flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind no-split-gso", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter no-split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "62cd", + "name": "Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-srchost ack-filter", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-srchost nonat nowash ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "0df3", + "name": "Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-dsthost ack-filter-aggressive", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-dsthost nonat nowash ack-filter-aggressive split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "9a75", + "name": "Create CAKE_MQ with memlimit and ptm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq memlimit 10000 ptm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw ptm overhead 0 memlimit 10000b ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "cdef", + "name": "Create CAKE_MQ with fwmark and atm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq fwmark 8 atm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw atm overhead 0 fwmark 0x8 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "93dd", + "name": "Create CAKE_MQ with overhead 0 and mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 256 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1475", + "name": "Create CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bf1", + "name": "Delete CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH handle 1: root", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "ee55", + "name": "Replace CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "6df9", + "name": "Change CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc change dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "67e2", + "name": "Show CAKE_MQ class (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC class show dev $ETH", + "matchPattern": "class cake_mq", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2de4", + "name": "Change bandwidth of CAKE_MQ (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5f62", + "name": "Fail to create CAKE_MQ with autorate-ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq autorate-ingress", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited autorate-ingress diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "038e", + "name": "Fail to change setting of sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 cake besteffort flows", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bdc", + "name": "Fail to replace sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 fq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "18e0", + "name": "Fail to install CAKE_MQ on single queue device", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 1\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + } +] -- cgit v1.2.3 From f88dc319fcb6d6a155e94469a355ce456dd85441 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 Jan 2026 12:05:17 +0100 Subject: objtool: Allow multiple pv_ops arrays Having a single large pv_ops array has the main disadvantage of needing all prototypes of the single array members in one header file. This is adding up to the need to include lots of otherwise unrelated headers. In order to allow multiple smaller pv_ops arrays dedicated to one area of the kernel each, allow multiple arrays in objtool. For better performance limit the possible names of the arrays to start with "pv_ops". Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260105110520.21356-19-jgross@suse.com --- tools/objtool/arch/x86/decode.c | 8 +++- tools/objtool/check.c | 74 +++++++++++++++++++++++++++-------- tools/objtool/include/objtool/check.h | 1 + 3 files changed, 65 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index f4af82508228..73bfea220d1b 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -711,10 +711,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec immr = find_reloc_by_dest(elf, (void *)sec, offset+3); disp = find_reloc_by_dest(elf, (void *)sec, offset+7); - if (!immr || strcmp(immr->sym->name, "pv_ops")) + if (!immr || strncmp(immr->sym->name, "pv_ops", 6)) break; - idx = (reloc_addend(immr) + 8) / sizeof(void *); + idx = pv_ops_idx_off(immr->sym->name); + if (idx < 0) + break; + + idx += (reloc_addend(immr) + 8) / sizeof(void *); func = disp->sym; if (disp->sym->type == STT_SECTION) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 61f3e0c48fcc..b3fec88d5bd3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -520,21 +520,57 @@ static int decode_instructions(struct objtool_file *file) } /* - * Read the pv_ops[] .data table to find the static initialized values. + * Known pv_ops*[] arrays. */ -static int add_pv_ops(struct objtool_file *file, const char *symname) +static struct { + const char *name; + int idx_off; +} pv_ops_tables[] = { + { .name = "pv_ops", }, + { .name = NULL, .idx_off = -1 } +}; + +/* + * Get index offset for a pv_ops* array. + */ +int pv_ops_idx_off(const char *symname) +{ + int idx; + + for (idx = 0; pv_ops_tables[idx].name; idx++) { + if (!strcmp(symname, pv_ops_tables[idx].name)) + break; + } + + return pv_ops_tables[idx].idx_off; +} + +/* + * Read a pv_ops*[] .data table to find the static initialized values. + */ +static int add_pv_ops(struct objtool_file *file, int pv_ops_idx) { struct symbol *sym, *func; unsigned long off, end; struct reloc *reloc; - int idx; + int idx, idx_off; + const char *symname; + symname = pv_ops_tables[pv_ops_idx].name; sym = find_symbol_by_name(file->elf, symname); - if (!sym) - return 0; + if (!sym) { + ERROR("Unknown pv_ops array %s", symname); + return -1; + } off = sym->offset; end = off + sym->len; + idx_off = pv_ops_tables[pv_ops_idx].idx_off; + if (idx_off < 0) { + ERROR("pv_ops array %s has unknown index offset", symname); + return -1; + } + for (;;) { reloc = find_reloc_by_dest_range(file->elf, sym->sec, off, end - off); if (!reloc) @@ -552,7 +588,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) return -1; } - if (objtool_pv_add(file, idx, func)) + if (objtool_pv_add(file, idx + idx_off, func)) return -1; off = reloc_offset(reloc) + 1; @@ -568,11 +604,6 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) */ static int init_pv_ops(struct objtool_file *file) { - static const char *pv_ops_tables[] = { - "pv_ops", - NULL, - }; - const char *pv_ops; struct symbol *sym; int idx, nr; @@ -581,11 +612,20 @@ static int init_pv_ops(struct objtool_file *file) file->pv_ops = NULL; - sym = find_symbol_by_name(file->elf, "pv_ops"); - if (!sym) + nr = 0; + for (idx = 0; pv_ops_tables[idx].name; idx++) { + sym = find_symbol_by_name(file->elf, pv_ops_tables[idx].name); + if (!sym) { + pv_ops_tables[idx].idx_off = -1; + continue; + } + pv_ops_tables[idx].idx_off = nr; + nr += sym->len / sizeof(unsigned long); + } + + if (nr == 0) return 0; - nr = sym->len / sizeof(unsigned long); file->pv_ops = calloc(nr, sizeof(struct pv_state)); if (!file->pv_ops) { ERROR_GLIBC("calloc"); @@ -595,8 +635,10 @@ static int init_pv_ops(struct objtool_file *file) for (idx = 0; idx < nr; idx++) INIT_LIST_HEAD(&file->pv_ops[idx].targets); - for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) { - if (add_pv_ops(file, pv_ops)) + for (idx = 0; pv_ops_tables[idx].name; idx++) { + if (pv_ops_tables[idx].idx_off < 0) + continue; + if (add_pv_ops(file, idx)) return -1; } diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 2e1346ad5e92..5f2f77bd9b41 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -159,5 +159,6 @@ const char *objtool_disas_insn(struct instruction *insn); extern size_t sym_name_max_len; extern struct disas_context *objtool_disas_ctx; +int pv_ops_idx_off(const char *symname); #endif /* _CHECK_H */ -- cgit v1.2.3 From 609e359ab904698f1e5aa0ab2fee2f4c29ee0886 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:13 +0100 Subject: selftests: vDSO: vdso_config: Add configurations for clock_getres_time64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures will start to implement this function. Make sure that tests can be written for it. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-2-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_config.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 50c261005111..5da223731b81 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -66,7 +66,7 @@ static const char *versions[7] = { }; __attribute__((unused)) -static const char *names[2][7] = { +static const char *names[2][8] = { { "__kernel_gettimeofday", "__kernel_clock_gettime", @@ -75,6 +75,7 @@ static const char *names[2][7] = { "__kernel_getcpu", "__kernel_clock_gettime64", "__kernel_getrandom", + "__kernel_clock_getres_time64", }, { "__vdso_gettimeofday", @@ -84,6 +85,7 @@ static const char *names[2][7] = { "__vdso_getcpu", "__vdso_clock_gettime64", "__vdso_getrandom", + "__vdso_clock_getres_time64", }, }; -- cgit v1.2.3 From 1dcd1273add368c2b7c65135e22b416e1b374781 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:14 +0100 Subject: selftests: vDSO: vdso_test_abi: Use UAPI system call numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SYS_clock_getres might have been redirected by libc to some other system call than the actual clock_getres. For testing it is required to use exactly this system call. Use the system call number exported by the UAPI headers which is always correct. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-3-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index c620317eaeea..a75c12dcb0f1 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -179,7 +179,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) clock_getres_fail++; } - ret = syscall(SYS_clock_getres, clk_id, &sys_ts); + ret = syscall(__NR_clock_getres, clk_id, &sys_ts); ksft_print_msg("The syscall resolution is %lld %lld\n", (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); -- cgit v1.2.3 From 4e6a2312986d437cc22805b9e08f86b15fee0318 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:15 +0100 Subject: selftests: vDSO: vdso_test_abi: Add test for clock_getres_time64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures will start to implement this function. Make sure it works correctly. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-4-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 53 +++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a75c12dcb0f1..b162a4ba9c4f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -36,6 +36,7 @@ typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_getres_time64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef time_t (*vdso_time_t)(time_t *t); static const char * const vdso_clock_name[] = { @@ -196,6 +197,55 @@ static void vdso_test_clock_getres(clockid_t clk_id) } } +#ifdef __NR_clock_getres_time64 +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + int clock_getres_fail = 0; + + /* Find clock_getres. */ + vdso_clock_getres_time64_t vdso_clock_getres_time64 = + (vdso_clock_getres_time64_t)vdso_sym(version, name[7]); + + if (!vdso_clock_getres_time64) { + ksft_print_msg("Couldn't find %s\n", name[7]); + ksft_test_result_skip("%s %s\n", name[7], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts, sys_ts; + long ret = VDSO_CALL(vdso_clock_getres_time64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The vdso resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + } else { + clock_getres_fail++; + } + + ret = syscall(__NR_clock_getres_time64, clk_id, &sys_ts); + + ksft_print_msg("The syscall resolution is %lld %lld\n", + (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); + + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) + clock_getres_fail++; + + if (clock_getres_fail > 0) { + ksft_test_result_fail("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_pass("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } +} +#else /* !__NR_clock_getres_time64 */ +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + ksft_test_result_skip("%s %s\n", name[7], vdso_clock_name[clk_id]); +} +#endif /* __NR_clock_getres_time64 */ + /* * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. @@ -208,9 +258,10 @@ static inline void vdso_test_clock(clockid_t clock_id) vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); + vdso_test_clock_getres_time64(clock_id); } -#define VDSO_TEST_PLAN 29 +#define VDSO_TEST_PLAN 38 int main(int argc, char **argv) { -- cgit v1.2.3 From b0b449e6fec4cd182bd4384f7eb9002596079f68 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 Jan 2026 12:05:20 +0100 Subject: x86/pvlocks: Move paravirt spinlock functions into own header Instead of having the pv spinlock function definitions in paravirt.h, move them into the new header paravirt-spinlock.h. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260105110520.21356-22-jgross@suse.com --- arch/x86/hyperv/hv_spinlock.c | 10 +-- arch/x86/include/asm/paravirt-base.h | 6 ++ arch/x86/include/asm/paravirt-spinlock.h | 145 +++++++++++++++++++++++++++++++ arch/x86/include/asm/paravirt.h | 61 ------------- arch/x86/include/asm/paravirt_types.h | 17 ---- arch/x86/include/asm/qspinlock.h | 87 ++----------------- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/kvm.c | 12 +-- arch/x86/kernel/paravirt-spinlocks.c | 26 +++++- arch/x86/kernel/paravirt.c | 21 ----- arch/x86/xen/spinlock.c | 10 +-- tools/objtool/check.c | 1 + 12 files changed, 198 insertions(+), 200 deletions(-) create mode 100644 arch/x86/include/asm/paravirt-spinlock.h (limited to 'tools') diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 2a3c2afb0154..210b494e4de0 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -78,11 +78,11 @@ void __init hv_init_spinlocks(void) pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); - pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_ops.lock.wait = hv_qlock_wait; - pv_ops.lock.kick = hv_qlock_kick; - pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted); + pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops_lock.wait = hv_qlock_wait; + pv_ops_lock.kick = hv_qlock_kick; + pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted); } static __init int hv_parse_nopvspin(char *arg) diff --git a/arch/x86/include/asm/paravirt-base.h b/arch/x86/include/asm/paravirt-base.h index 3827ea20de18..982a0b93bc76 100644 --- a/arch/x86/include/asm/paravirt-base.h +++ b/arch/x86/include/asm/paravirt-base.h @@ -26,4 +26,10 @@ u64 _paravirt_ident_64(u64); #endif #define paravirt_nop ((void *)nop_func) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void paravirt_set_cap(void); +#else +static inline void paravirt_set_cap(void) { } +#endif + #endif /* _ASM_X86_PARAVIRT_BASE_H */ diff --git a/arch/x86/include/asm/paravirt-spinlock.h b/arch/x86/include/asm/paravirt-spinlock.h new file mode 100644 index 000000000000..a5011ef3a6cc --- /dev/null +++ b/arch/x86/include/asm/paravirt-spinlock.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_PARAVIRT_SPINLOCK_H +#define _ASM_X86_PARAVIRT_SPINLOCK_H + +#include + +#ifdef CONFIG_SMP +#include +#endif + +struct qspinlock; + +struct pv_lock_ops { + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); + + struct paravirt_callee_save vcpu_is_preempted; +} __no_randomize_layout; + +extern struct pv_lock_ops pv_ops_lock; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); +extern bool nopvspin; + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_ops_lock, queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_ALT_VCALLEE1(pv_ops_lock, queued_spin_unlock, lock, + "movb $0, (%%" _ASM_ARG1 ");", + ALT_NOT(X86_FEATURE_PVUNLOCK)); +} + +static __always_inline bool pv_vcpu_is_preempted(long cpu) +{ + return PVOP_ALT_CALLEE1(bool, pv_ops_lock, vcpu_is_preempted, cpu, + "xor %%" _ASM_AX ", %%" _ASM_AX ";", + ALT_NOT(X86_FEATURE_VCPUPREEMPT)); +} + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release(&lock->locked, 0); +} + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + kcsan_release(); + pv_queued_spin_unlock(lock); +} + +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(long cpu) +{ + return pv_vcpu_is_preempted(cpu); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_ops_lock, wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_ops_lock, kick, cpu); +} + +void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); +bool __raw_callee_save___native_vcpu_is_preempted(long cpu); +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +void __init native_pv_lock_init(void); +__visible void __native_queued_spin_unlock(struct qspinlock *lock); +bool pv_is_native_spin_unlock(void); +__visible bool __native_vcpu_is_preempted(long cpu); +bool pv_is_native_vcpu_is_preempted(void); + +/* + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. + * + * Native (and PV wanting native due to vCPU pinning) should keep this key + * disabled. Native does not touch the key. + * + * When in a guest then native_pv_lock_init() enables the key first and + * KVM/XEN might conditionally disable it later in the boot process again. + */ +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); + +/* + * Shortcut for the queued_spin_lock_slowpath() function that allows + * virt to hijack it. + * + * Returns: + * true - lock has been negotiated, all done; + * false - queued_spin_lock_slowpath() will do its thing. + */ +#define virt_spin_lock virt_spin_lock +static inline bool virt_spin_lock(struct qspinlock *lock) +{ + int val; + + if (!static_branch_likely(&virt_spin_lock_key)) + return false; + + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. + */ + + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } + + return true; +} + +#endif /* _ASM_X86_PARAVIRT_SPINLOCK_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ec274d13bae0..b21072af731d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,15 +19,6 @@ #include #include -__visible void __native_queued_spin_unlock(struct qspinlock *lock); -bool pv_is_native_spin_unlock(void); -__visible bool __native_vcpu_is_preempted(long cpu); -bool pv_is_native_vcpu_is_preempted(void); - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -void __init paravirt_set_cap(void); -#endif - /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { @@ -522,46 +513,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, { pv_ops.mmu.set_fixmap(idx, phys, flags); } -#endif - -#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) - -static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, - u32 val) -{ - PVOP_VCALL2(pv_ops, lock.queued_spin_lock_slowpath, lock, val); -} - -static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) -{ - PVOP_ALT_VCALLEE1(pv_ops, lock.queued_spin_unlock, lock, - "movb $0, (%%" _ASM_ARG1 ");", - ALT_NOT(X86_FEATURE_PVUNLOCK)); -} - -static __always_inline void pv_wait(u8 *ptr, u8 val) -{ - PVOP_VCALL2(pv_ops, lock.wait, ptr, val); -} - -static __always_inline void pv_kick(int cpu) -{ - PVOP_VCALL1(pv_ops, lock.kick, cpu); -} - -static __always_inline bool pv_vcpu_is_preempted(long cpu) -{ - return PVOP_ALT_CALLEE1(bool, pv_ops, lock.vcpu_is_preempted, cpu, - "xor %%" _ASM_AX ", %%" _ASM_AX ";", - ALT_NOT(X86_FEATURE_VCPUPREEMPT)); -} -void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); -bool __raw_callee_save___native_vcpu_is_preempted(long cpu); - -#endif /* SMP && PARAVIRT_SPINLOCKS */ - -#ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, pv_ops, irq.save_fl, "pushf; pop %%rax;", @@ -588,8 +540,6 @@ static __always_inline unsigned long arch_local_irq_save(void) } #endif -void native_pv_lock_init(void) __init; - #else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 @@ -613,12 +563,6 @@ void native_pv_lock_init(void) __init; #endif /* __ASSEMBLER__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop - -#ifndef __ASSEMBLER__ -static inline void native_pv_lock_init(void) -{ -} -#endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLER__ @@ -634,10 +578,5 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) } #endif -#ifndef CONFIG_PARAVIRT_SPINLOCKS -static inline void paravirt_set_cap(void) -{ -} -#endif #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b36d425d099b..7ccd41628d36 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -184,22 +184,6 @@ struct pv_mmu_ops { #endif } __no_randomize_layout; -#ifdef CONFIG_SMP -#include -#endif - -struct qspinlock; - -struct pv_lock_ops { - void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); - struct paravirt_callee_save queued_spin_unlock; - - void (*wait)(u8 *ptr, u8 val); - void (*kick)(int cpu); - - struct paravirt_callee_save vcpu_is_preempted; -} __no_randomize_layout; - /* This contains all the paravirt structures: we get a convenient * number for each function using the offset which we use to indicate * what to patch. */ @@ -207,7 +191,6 @@ struct paravirt_patch_template { struct pv_cpu_ops cpu; struct pv_irq_ops irq; struct pv_mmu_ops mmu; - struct pv_lock_ops lock; } __no_randomize_layout; extern struct paravirt_patch_template pv_ops; diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 68da67df304d..25a1919542d9 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -7,6 +7,9 @@ #include #include #include +#ifdef CONFIG_PARAVIRT +#include +#endif #define _Q_PENDING_LOOPS (1 << 9) @@ -27,90 +30,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo return val; } -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_init_lock_hash(void); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); -extern bool nopvspin; - -#define queued_spin_unlock queued_spin_unlock -/** - * queued_spin_unlock - release a queued spinlock - * @lock : Pointer to queued spinlock structure - * - * A smp_store_release() on the least-significant byte. - */ -static inline void native_queued_spin_unlock(struct qspinlock *lock) -{ - smp_store_release(&lock->locked, 0); -} - -static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) -{ - pv_queued_spin_lock_slowpath(lock, val); -} - -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - kcsan_release(); - pv_queued_spin_unlock(lock); -} - -#define vcpu_is_preempted vcpu_is_preempted -static inline bool vcpu_is_preempted(long cpu) -{ - return pv_vcpu_is_preempted(cpu); -} +#ifndef CONFIG_PARAVIRT +static inline void native_pv_lock_init(void) { } #endif -#ifdef CONFIG_PARAVIRT -/* - * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. - * - * Native (and PV wanting native due to vCPU pinning) should keep this key - * disabled. Native does not touch the key. - * - * When in a guest then native_pv_lock_init() enables the key first and - * KVM/XEN might conditionally disable it later in the boot process again. - */ -DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); - -/* - * Shortcut for the queued_spin_lock_slowpath() function that allows - * virt to hijack it. - * - * Returns: - * true - lock has been negotiated, all done; - * false - queued_spin_lock_slowpath() will do its thing. - */ -#define virt_spin_lock virt_spin_lock -static inline bool virt_spin_lock(struct qspinlock *lock) -{ - int val; - - if (!static_branch_likely(&virt_spin_lock_key)) - return false; - - /* - * On hypervisors without PARAVIRT_SPINLOCKS support we fall - * back to a Test-and-Set spinlock, because fair locks have - * horrible lock 'holder' preemption issues. - */ - - __retry: - val = atomic_read(&lock->val); - - if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { - cpu_relax(); - goto __retry; - } - - return true; -} - -#endif /* CONFIG_PARAVIRT */ - #include #endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bc184dd38d99..e9aeeeafad17 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -126,7 +126,7 @@ obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o +obj-$(CONFIG_PARAVIRT) += paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 21b4de55f823..de550b12d9ab 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -829,8 +829,10 @@ static void __init kvm_guest_init(void) has_steal_clock = 1; static_call_update(pv_steal_clock, kvm_steal_clock); - pv_ops.lock.vcpu_is_preempted = +#ifdef CONFIG_PARAVIRT_SPINLOCKS + pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); +#endif } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -1126,11 +1128,11 @@ void __init kvm_spinlock_init(void) pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); - pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_ops.lock.queued_spin_unlock = + pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_ops.lock.wait = kvm_wait; - pv_ops.lock.kick = kvm_kick_cpu; + pv_ops_lock.wait = kvm_wait; + pv_ops_lock.kick = kvm_kick_cpu; /* * When PV spinlock is enabled which is preferred over diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 9e1ea99ad9df..95452444868f 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -3,12 +3,22 @@ * Split spinlock implementation out into its own file, so it can be * compiled in a FTRACE-compatible way. */ +#include #include #include #include -#include +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); +#ifdef CONFIG_SMP +void __init native_pv_lock_init(void) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_enable(&virt_spin_lock_key); +} +#endif + +#ifdef CONFIG_PARAVIRT_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -17,7 +27,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) { - return pv_ops.lock.queued_spin_unlock.func == + return pv_ops_lock.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } @@ -29,7 +39,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); bool pv_is_native_vcpu_is_preempted(void) { - return pv_ops.lock.vcpu_is_preempted.func == + return pv_ops_lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } @@ -41,3 +51,13 @@ void __init paravirt_set_cap(void) if (!pv_is_native_vcpu_is_preempted()) setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); } + +struct pv_lock_ops pv_ops_lock = { + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, + .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), +}; +EXPORT_SYMBOL(pv_ops_lock); +#endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5dfbd3f55792..a6ed52cae003 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -57,14 +57,6 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif -DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); - -void __init native_pv_lock_init(void) -{ - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - static_branch_enable(&virt_spin_lock_key); -} - static noinstr void pv_native_safe_halt(void) { native_safe_halt(); @@ -221,19 +213,6 @@ struct paravirt_patch_template pv_ops = { .mmu.set_fixmap = native_set_fixmap, #endif /* CONFIG_PARAVIRT_XXL */ - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - /* Lock ops. */ -#ifdef CONFIG_SMP - .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, - .lock.queued_spin_unlock = - PV_CALLEE_SAVE(__native_queued_spin_unlock), - .lock.wait = paravirt_nop, - .lock.kick = paravirt_nop, - .lock.vcpu_is_preempted = - PV_CALLEE_SAVE(__native_vcpu_is_preempted), -#endif /* SMP */ -#endif }; #ifdef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index fe56646d6919..83ac24ead289 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -134,10 +134,10 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); __pv_init_lock_hash(); - pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_ops.lock.queued_spin_unlock = + pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_ops.lock.wait = xen_qlock_wait; - pv_ops.lock.kick = xen_qlock_kick; - pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); + pv_ops_lock.wait = xen_qlock_wait; + pv_ops_lock.kick = xen_qlock_kick; + pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b3fec88d5bd3..c2952df6842c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -527,6 +527,7 @@ static struct { int idx_off; } pv_ops_tables[] = { { .name = "pv_ops", }, + { .name = "pv_ops_lock", }, { .name = NULL, .idx_off = -1 } }; -- cgit v1.2.3 From 8441c7d3bd6c5a52ab2ecf77e43a5bf262004f5c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 7 Jan 2026 13:05:43 +0100 Subject: cxl: Check for invalid addresses returned from translation functions on errors Translation functions may return an invalid address in case of errors. If the address is not checked the further use of the invalid value will cause an address corruption. Consistently check for a valid address returned by translation functions. Use RESOURCE_SIZE_MAX to indicate an invalid address for type resource_size_t. Depending on the type either RESOURCE_SIZE_MAX or ULLONG_MAX is used to indicate an address error. Propagating an invalid address from a failed translation may cause userspace to think it has received a valid SPA, when in fact it is wrong. The CXL userspace API, using trace events, expects ULLONG_MAX to indicate a translation failure. If ULLONG_MAX is not returned immediately, subsequent calculations can transform that bad address into a different value (!ULLONG_MAX), and an invalid SPA may be returned to userspace. This can lead to incorrect diagnostics and erroneous corrective actions. [ dj: Added user impact statement from Alison. ] [ dj: Fixed checkpatch tab alignment issue. ] Reviewed-by: Dave Jiang Signed-off-by: Robert Richter Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset") Fixes: b78b9e7b7979 ("cxl/region: Refactor address translation funcs for testing") Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260107120544.410993-1-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 2 +- drivers/cxl/core/region.c | 34 ++++++++++++++++++++++++++-------- tools/testing/cxl/test/cxl_translate.c | 30 ++++++++++++++++++------------ 3 files changed, 45 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index a470099a69f1..eb5a3a7640c6 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -530,7 +530,7 @@ resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) { - resource_size_t base = -1; + resource_size_t base = RESOURCE_SIZE_MAX; lockdep_assert_held(&cxl_rwsem.dpa); if (cxled->dpa_res) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index fc36a5413d3f..5bd1213737fa 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3118,7 +3118,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; - u64 dpa_offset, hpa_offset, hpa; + u64 base, dpa_offset, hpa_offset, hpa; u16 eig = 0; u8 eiw = 0; int pos; @@ -3136,8 +3136,14 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); - dpa_offset = dpa - cxl_dpa_resource_start(cxled); + base = cxl_dpa_resource_start(cxled); + if (base == RESOURCE_SIZE_MAX) + return ULLONG_MAX; + + dpa_offset = dpa - base; hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); + if (hpa_offset == ULLONG_MAX) + return ULLONG_MAX; /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start + p->cache_size; @@ -3146,6 +3152,9 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, if (cxlrd->ops.hpa_to_spa) hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa); + if (hpa == ULLONG_MAX) + return ULLONG_MAX; + if (!cxl_resource_contains_addr(p->res, hpa)) { dev_dbg(&cxlr->dev, "Addr trans fail: hpa 0x%llx not in region\n", hpa); @@ -3170,7 +3179,8 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct cxl_region_params *p = &cxlr->params; struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_endpoint_decoder *cxled; - u64 hpa, hpa_offset, dpa_offset; + u64 hpa_offset = offset; + u64 dpa, dpa_offset; u16 eig = 0; u8 eiw = 0; int pos; @@ -3187,10 +3197,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, * CXL HPA is assumed to equal SPA. */ if (cxlrd->ops.spa_to_hpa) { - hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); - hpa_offset = hpa - p->res->start; - } else { - hpa_offset = offset; + hpa_offset = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); + if (hpa_offset == ULLONG_MAX) { + dev_dbg(&cxlr->dev, "HPA not found for %pr offset %#llx\n", + p->res, offset); + return -ENXIO; + } + hpa_offset -= p->res->start; } pos = cxl_calculate_position(hpa_offset, eiw, eig); @@ -3207,8 +3220,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, cxled = p->targets[i]; if (cxled->pos != pos) continue; + + dpa = cxl_dpa_resource_start(cxled); + if (dpa != RESOURCE_SIZE_MAX) + dpa += dpa_offset; + result->cxlmd = cxled_to_memdev(cxled); - result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset; + result->dpa = dpa; return 0; } diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c index 2200ae21795c..16328b2112b2 100644 --- a/tools/testing/cxl/test/cxl_translate.c +++ b/tools/testing/cxl/test/cxl_translate.c @@ -68,6 +68,8 @@ static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways, /* Calculate base HPA offset from DPA and position */ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig); + if (hpa_offset == ULLONG_MAX) + return ULLONG_MAX; if (math == XOR_MATH) { cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; @@ -258,19 +260,23 @@ static int test_random_params(void) pos = get_random_u32() % ways; dpa = get_random_u64() >> 12; + reverse_dpa = ULLONG_MAX; + reverse_pos = -1; + hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig); - reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); - reverse_pos = cxl_calculate_position(hpa, eiw, eig); - - if (reverse_dpa != dpa || reverse_pos != pos) { - pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", - i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, - eig); - - if (failures++ > 10) { - pr_err("test random too many failures, stop\n"); - break; - } + if (hpa != ULLONG_MAX) { + reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); + reverse_pos = cxl_calculate_position(hpa, eiw, eig); + if (reverse_dpa == dpa && reverse_pos == pos) + continue; + } + + pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", + i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, eig); + + if (failures++ > 10) { + pr_err("test random too many failures, stop\n"); + break; } } pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures); -- cgit v1.2.3 From 2465a08d433dd4ae0c4eecdb8e79c54b7c5e5a55 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:23 -0800 Subject: selftests/bpf: Fix dmabuf_iter/lots_of_buffers failure with 64K page On arm64 with 64K page , I observed the following test failure: ... subtest_dmabuf_iter_check_lots_of_buffers:FAIL:total_bytes_read unexpected total_bytes_read: actual 4696 <= expected 65536 #97/3 dmabuf_iter/lots_of_buffers:FAIL With 4K page on x86, the total_bytes_read is 4593. With 64K page on arm64, the total_byte_read is 4696. In progs/dmabuf_iter.c, for each iteration, the output is BPF_SEQ_PRINTF(seq, "%lu\n%llu\n%s\n%s\n", inode, size, name, exporter); The only difference between 4K and 64K page is 'size' in the above BPF_SEQ_PRINTF. The 4K page will output '4096' and the 64K page will output '65536'. So the total_bytes_read with 64K page is slighter greater than 4K page. Adjusting the total_bytes_read from 65536 to 4096 fixed the issue. Cc: T.J. Mercier Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061023.3798085-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c index e442be9dde7e..fb2cea710db3 100644 --- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c @@ -233,7 +233,7 @@ static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel) while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0) total_bytes_read += bytes_read; - ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read"); + ASSERT_GT(total_bytes_read, 4096, "total_bytes_read"); close(iter_fd); } -- cgit v1.2.3 From d2f7cd20a7c7742b83d2c899044d4f2a851a4a7d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:28 -0800 Subject: selftests/bpf: Fix sk_bypass_prot_mem failure with 64K page The current selftest sk_bypass_prot_mem only supports 4K page. When running with 64K page on arm64, the following failure happens: ... check_bypass:FAIL:no bypass unexpected no bypass: actual 3 <= expected 32 ... #385/1 sk_bypass_prot_mem/TCP :FAIL ... check_bypass:FAIL:no bypass unexpected no bypass: actual 4 <= expected 32 ... #385/2 sk_bypass_prot_mem/UDP :FAIL ... Adding support to 64K page as well fixed the failure. Cc: Kuniyuki Iwashima Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061028.3798326-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c index e4940583924b..e2c867fd5244 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c @@ -5,9 +5,14 @@ #include "sk_bypass_prot_mem.skel.h" #include "network_helpers.h" +#ifndef PAGE_SIZE +#include +#define PAGE_SIZE getpagesize() +#endif + #define NR_PAGES 32 #define NR_SOCKETS 2 -#define BUF_TOTAL (NR_PAGES * 4096 / NR_SOCKETS) +#define BUF_TOTAL (NR_PAGES * PAGE_SIZE / NR_SOCKETS) #define BUF_SINGLE 1024 #define NR_SEND (BUF_TOTAL / BUF_SINGLE) -- cgit v1.2.3 From 951d79017e8a0af6db4a167a89746b4a0924626b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:33 -0800 Subject: selftests/bpf: Fix verifier_arena_globals1 failure with 64K page With 64K page on arm64, verifier_arena_globals1 failed like below: ... libbpf: map 'arena': failed to create: -E2BIG ... #509/1 verifier_arena_globals1/check_reserve1:FAIL ... For 64K page, if the number of arena pages is (1UL << 20), the total memory will exceed 4G and this will cause map creation failure. Adjusting ARENA_PAGES based on the actual page size fixed the problem. Cc: Emil Tsalapatis Signed-off-by: Yonghong Song Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260113061033.3798549-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_globals1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c index 14afef3d6442..83182ddbfb95 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -9,7 +9,7 @@ #include "bpf_arena_common.h" #include "bpf_misc.h" -#define ARENA_PAGES (1UL<< (32 - 12)) +#define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) #define GLOBAL_PAGES (16) struct { -- cgit v1.2.3 From 9160335317cb404f54ad2f509546c666ddd4d0eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 12 Jan 2026 12:13:58 -0800 Subject: selftests/bpf: Add tests for s>>=31 and s>>=63 Add tests for special arithmetic shift right. Signed-off-by: Alexei Starovoitov Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260112201424.816836-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_subreg.c | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index b3e1c3eef9ae..be328100ba53 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -738,4 +738,89 @@ __naked void ldx_w_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_and(void) +{ + /* Below is what LLVM generates in cilium's bpf_wiregard.o */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 &= -134; /* w2 becomes 0 or -134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if w2 != -136 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_and(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 &= -134; \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if r2 != -136 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_or(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 |= 134; /* w2 becomes -1 or 134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if w2 == -1 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_or(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 |= 134; /* r2 becomes -1 or 134 */ \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if r2 == -1 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a2297e74a07d21eb498d8549ae6fddc35cf26ec6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:35 -0800 Subject: perf srcline: Add configuration support for the addr2line style Allow the addr2line style to be specified on the `perf report` command line or in the .perfconfig file. Committer testing: The methods: # perf probe -x ~/bin/perf -F *__addr2line cmd__addr2line libbfd__addr2line libdw__addr2line llvm__addr2line # So if we configure one of them, say 'addr2line': # perf config addr2line.style=addr2line # perf config addr2line.style addr2line.style=addr2line # And have probes on all of them: # perf probe -x ~/bin/perf *__addr2line Added new events: probe_perf:cmd__addr2line (on *__addr2line in /home/acme/bin/perf) probe_perf:llvm__addr2line (on *__addr2line in /home/acme/bin/perf) probe_perf:libbfd__addr2line (on *__addr2line in /home/acme/bin/perf) probe_perf:libdw__addr2line (on *__addr2line in /home/acme/bin/perf) You can now use it in all perf tools, such as: perf record -e probe_perf:libdw__addr2line -aR sleep 1 # Only the selected method should be used: # perf stat -e probe_perf:*_addr2line perf report -f --dso perf --stdio -s srcfile,srcline # Total Lost Samples: 0 # # Samples: 4K of event 'cpu/cycles/Pu' # Event count (approx.): 5535180842 # # Overhead Source File Source:Line # ........ ............ ............... # 99.04% inlineloop.c inlineloop.c:21 0.46% inlineloop.c inlineloop.c:20 # # (Tip: For hierarchical output, try: perf report --hierarchy) # Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline': 44 probe_perf:cmd__addr2line 0 probe_perf:llvm__addr2line 0 probe_perf:libbfd__addr2line 0 probe_perf:libdw__addr2line 0.035915611 seconds time elapsed 0.028008000 seconds user 0.009051000 seconds sys # I checked and that is the case for the other methods. Also when using: # perf config addr2line.style=libdw,llvm Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline': 0 probe_perf:cmd__addr2line 23 probe_perf:llvm__addr2line 0 probe_perf:libbfd__addr2line 44 probe_perf:libdw__addr2line Reviewed-by: James Clark Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 10 +++++ tools/perf/util/config.c | 4 ++ tools/perf/util/srcline.c | 98 ++++++++++++++++++++++++++++++++++++++----- tools/perf/util/srcline.h | 2 + tools/perf/util/symbol_conf.h | 10 +++++ 5 files changed, 113 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 6c2b4f93ec78..2e936928e8c0 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1271,6 +1271,13 @@ parse_percent_limit(const struct option *opt, const char *str, return 0; } +static int +report_parse_addr2line_config(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + return addr2line_configure("addr2line.style", arg, NULL); +} + static int process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) @@ -1447,6 +1454,9 @@ int cmd_report(int argc, const char **argv) "objdump binary to use for disassembly and annotations"), OPT_STRING(0, "addr2line", &addr2line_path, "path", "addr2line binary to use for line numbers"), + OPT_CALLBACK(0, "addr2line-style", NULL, "addr2line style", + "addr2line styles (libdw,llvm,libbfd,addr2line)", + report_parse_addr2line_config), OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle, "Symbol demangling. Enabled by default, use --no-demangle to disable."), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index e0219bc6330a..0452fbc6c085 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -20,6 +20,7 @@ #include "util/stat.h" /* perf_stat__set_big_num */ #include "util/evsel.h" /* evsel__hw_names, evsel__use_bpf_counters */ #include "util/addr2line.h" /* addr2line_timeout_ms */ +#include "srcline.h" #include "build-id.h" #include "debug.h" #include "config.h" @@ -519,6 +520,9 @@ int perf_default_config(const char *var, const char *value, if (strstarts(var, "stat.")) return perf_stat_config(var, value); + if (strstarts(var, "addr2line.")) + return addr2line_configure(var, value, dummy); + /* Add other config variables here. */ return 0; } diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index e2d280678b02..28fa1abd1fd3 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -7,9 +7,11 @@ #include "llvm.h" #include "symbol.h" #include "libdw.h" +#include "debug.h" #include #include +#include bool srcline_full_filename; @@ -138,21 +140,95 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int * struct dso *dso, bool unwind_inlines, struct inline_node *node, struct symbol *sym) { - int ret; + int ret = 0; + + if (symbol_conf.addr2line_style[0] == A2L_STYLE_UNKNOWN) { + int i = 0; + + /* Default addr2line fallback order. */ +#ifdef HAVE_LIBDW_SUPPORT + symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBDW; +#endif +#ifdef HAVE_LIBLLVM_SUPPORT + symbol_conf.addr2line_style[i++] = A2L_STYLE_LLVM; +#endif +#ifdef HAVE_LIBBFD_SUPPORT + symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBBFD; +#endif + symbol_conf.addr2line_style[i++] = A2L_STYLE_CMD; + } + + for (size_t i = 0; i < ARRAY_SIZE(symbol_conf.addr2line_style); i++) { + switch (symbol_conf.addr2line_style[i]) { + case A2L_STYLE_LIBDW: + ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, + node, sym); + break; + case A2L_STYLE_LLVM: + ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, + node, sym); + break; + case A2L_STYLE_LIBBFD: + ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, + node, sym); + break; + case A2L_STYLE_CMD: + ret = cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, + node, sym); + break; + case A2L_STYLE_UNKNOWN: + default: + break; + } + if (ret > 0) + return ret; + } + + return 0; +} + +int addr2line_configure(const char *var, const char *value, void *cb __maybe_unused) +{ + static const char * const a2l_style_names[] = { + [A2L_STYLE_LIBDW] = "libdw", + [A2L_STYLE_LLVM] = "llvm", + [A2L_STYLE_LIBBFD] = "libbfd", + [A2L_STYLE_CMD] = "addr2line", + NULL + }; + + char *s, *p, *saveptr; + size_t i = 0; - ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); - if (ret > 0) - return ret; + if (strcmp(var, "addr2line.style")) + return 0; + + if (!value) + return -1; - ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); - if (ret > 0) - return ret; + s = strdup(value); + if (!s) + return -1; - ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); - if (ret > 0) - return ret; + p = strtok_r(s, ",", &saveptr); + while (p && i < ARRAY_SIZE(symbol_conf.addr2line_style)) { + bool found = false; + char *q = strim(p); + + for (size_t j = A2L_STYLE_LIBDW; j < MAX_A2L_STYLE; j++) { + if (!strcasecmp(q, a2l_style_names[j])) { + symbol_conf.addr2line_style[i++] = j; + found = true; + break; + } + } + if (!found) + pr_warning("Unknown addr2line style: %s\n", q); + p = strtok_r(NULL, ",", &saveptr); + } - return cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); + free(s); + return 0; } static struct inline_node *addr2inlines(const char *dso_name, u64 addr, diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index be9f002bf234..7c37b3bf9ce7 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -63,4 +63,6 @@ struct symbol *new_inline_sym(struct dso *dso, struct symbol *base_sym, const char *funcname); +int addr2line_configure(const char *var, const char *value, void *cb); + #endif /* PERF_SRCLINE_H */ diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 7a80d2c14d9b..71bb17372a6c 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -9,6 +9,15 @@ struct strlist; struct intlist; +enum a2l_style { + A2L_STYLE_UNKNOWN = 0, + A2L_STYLE_LIBDW, + A2L_STYLE_LLVM, + A2L_STYLE_LIBBFD, + A2L_STYLE_CMD, +}; +#define MAX_A2L_STYLE (A2L_STYLE_CMD + 1) + struct symbol_conf { bool nanosecs; unsigned short priv_size; @@ -70,6 +79,7 @@ struct symbol_conf { *col_width_list_str, *bt_stop_list_str; const char *addr2line_path; + enum a2l_style addr2line_style[MAX_A2L_STYLE]; unsigned long time_quantum; struct strlist *dso_list, *comm_list, -- cgit v1.2.3 From abec464767b5d26f0612250d511c18f420826ca1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:36 -0800 Subject: perf callchain: Fix srcline printing with inlines sample__fprintf_callchain() was using map__fprintf_srcline() which won't report inline line numbers. Fix by using the srcline from the callchain and falling back to the map variant. Fixes: 25da4fab5f66e659 ("perf evsel: Move fprintf methods to separate source file") Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel_fprintf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 10f1a03c2860..5521d00bff2c 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -185,8 +185,12 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, if (print_dso && (!sym || !sym->inlined)) printed += map__fprintf_dsoname_dsoff(map, print_dsoff, addr, fp); - if (print_srcline) - printed += map__fprintf_srcline(map, addr, "\n ", fp); + if (print_srcline) { + if (node->srcline) + printed += fprintf(fp, "\n %s", node->srcline); + else + printed += map__fprintf_srcline(map, addr, "\n ", fp); + } if (sym && sym->inlined) printed += fprintf(fp, " (inlined)"); -- cgit v1.2.3 From 54a23bff770961e024e2c61cd1f46888190c3e79 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Jan 2026 20:13:38 -0800 Subject: perf test: Test addr2line unwinding works with inline functions Add a test that seeks to see inline functions correctly displayed in 'perf script' from the inlineloop workload. Committer testing: # perf test 'addr2line inline unwinding' 76: test addr2line inline unwinding : Ok # perf test -vv 'addr2line inline unwinding' 76: test addr2line inline unwinding: --- start --- test child forked, pid 1508628 Inline unwinding verification test [ perf record: Woken up 129 times to write data ] [ perf record: Captured and wrote 32.282 MB /tmp/perf-test-inline-addr2line.L4Sz8QtADJ/perf.data (4014 samples) ] Inline unwinding verification test [Success] ---- end(0) ---- 76: test addr2line inline unwinding : Ok # Reviewed-by: James Clark Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Tony Jones Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/addr2line_inlines.sh | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100755 tools/perf/tests/shell/addr2line_inlines.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh new file mode 100755 index 000000000000..4a5b6f5be23d --- /dev/null +++ b/tools/perf/tests/shell/addr2line_inlines.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# test addr2line inline unwinding +# SPDX-License-Identifier: GPL-2.0 + +set -e + +err=0 +test_dir=$(mktemp -d /tmp/perf-test-inline-addr2line.XXXXXXXXXX) +perf_data="${test_dir}/perf.data" +perf_script_txt="${test_dir}/perf_script.txt" + +cleanup() { + rm -rf "${test_dir}" + trap - EXIT TERM INT +} + +trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +test_inlinedloop() { + echo "Inline unwinding verification test" + # Record data. Currently only dwarf callchains support inlined functions. + perf record --call-graph dwarf -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1 + + # Check output with inline (default) and srcline + perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}" + + # Expect the leaf and middle functions to occur on lines in the 20s, with + # the non-inlined parent function on a line in the 30s. + if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" && + grep -q "inlineloop.c:3.$" "${perf_script_txt}" + then + echo "Inline unwinding verification test [Success]" + else + echo "Inline unwinding verification test [Failed missing inlined functions]" + err=1 + fi +} + +test_inlinedloop + +cleanup +exit $err -- cgit v1.2.3 From b6ee9b6e206b288921c14c906eebf4b32fe0c0d8 Mon Sep 17 00:00:00 2001 From: Sri Jayaramappa Date: Tue, 2 Dec 2025 16:36:32 -0500 Subject: libsubcmd: Fix null intersection case in exclude_cmds() When there is no exclusion occurring from the cmds list - for example - cmds contains ["read-vdso32"] and excludes contains ["archive"] - the main loop completes with ci == cj == 0. In the original code the loop processing the remaining elements in the list was conditional: if (ci != cj) { ...} So we end up in the assertion loop since ci < cmds->cnt and we incorrectly try to assert the list elements to be NULL and fail with the following error help.c:104: exclude_cmds: Assertion `cmds->names[ci] == NULL' failed. Fix this by moving the if (ci != cj) check inside of a broader loop. If ci != cj, left shift the list elements, as before, and then unconditionally advance the ci and cj indicies which also covers the ci == cj case. Fixes: 1fdf938168c4d26f ("perf tools: Fix use-after-free in help_unknown_cmd()") Reviewed-by: Guilherme Amadio Signed-off-by: Sri Jayaramappa Tested-by: Guilherme Amadio Tested-by: Ian Rogers Cc: Joshua Hunt Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20251202213632.2873731-1-sjayaram@akamai.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/help.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index ddaeb4eb3e24..db94aa685b73 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -97,11 +97,13 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) ei++; } } - if (ci != cj) { - while (ci < cmds->cnt) { - cmds->names[cj++] = cmds->names[ci]; - cmds->names[ci++] = NULL; + while (ci < cmds->cnt) { + if (ci != cj) { + cmds->names[cj] = cmds->names[ci]; + cmds->names[ci] = NULL; } + ci++; + cj++; } for (ci = cj; ci < cmds->cnt; ci++) assert(cmds->names[ci] == NULL); -- cgit v1.2.3 From c65182ef9df6bb96fd85b56a2bcdd18d64c4d3b5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 12 Jan 2026 11:33:39 -0500 Subject: selftests: net: reduce txtimestamp deschedule flakes This test occasionally fails due to exceeding timing bounds, as run in continuous testing on netdev.bots: https://netdev.bots.linux.dev/contest.html?test=txtimestamp-sh A common pattern is a single elevated delay between USR and SND. # 8.36 [+0.00] test SND # 8.36 [+0.00] USR: 1767864384 s 240994 us (seq=0, len=0) # 8.44 [+0.08] ERROR: 18461 us expected between 10000 and 18000 # 8.44 [+0.00] SND: 1767864384 s 259455 us (seq=42, len=10) (USR +18460 us) # 8.52 [+0.07] SND: 1767864384 s 339523 us (seq=42, len=10) (USR +10005 us) # 8.52 [+0.00] USR: 1767864384 s 409580 us (seq=0, len=0) # 8.60 [+0.08] SND: 1767864384 s 419586 us (seq=42, len=10) (USR +10005 us) # 8.60 [+0.00] USR: 1767864384 s 489645 us (seq=0, len=0) # 8.68 [+0.08] SND: 1767864384 s 499651 us (seq=42, len=10) (USR +10005 us) # 8.68 [+0.00] USR-SND: count=4, avg=12119 us, min=10005 us, max=18460 us (Note that other delays are nowhere near the large 8ms tolerance.) One hypothesis is that the task is descheduled between taking the USR timestamp and sending the packet. Possibly in printing. Delay taking the timestamp closer to sendmsg, and delay printing until after sendmsg. With this change, failure rate is significantly lower in current runs. Link: https://lore.kernel.org/netdev/20260107110521.1aab55e9@kernel.org/ Suggested-by: Jakub Kicinski Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20260112163355.3510150-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index bcc14688661d..170be192f5c7 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -206,12 +206,10 @@ static void __print_timestamp(const char *name, struct timespec *cur, fprintf(stderr, "\n"); } -static void print_timestamp_usr(void) +static void record_timestamp_usr(void) { if (clock_gettime(CLOCK_REALTIME, &ts_usr)) error(1, errno, "clock_gettime"); - - __print_timestamp(" USR", &ts_usr, 0, 0); } static void print_timestamp(struct scm_timestamping *tss, int tstype, @@ -599,8 +597,6 @@ static void do_test(int family, unsigned int report_opt) fill_header_udp(buf + off, family == PF_INET); } - print_timestamp_usr(); - iov.iov_base = buf; iov.iov_len = total_len; @@ -655,10 +651,14 @@ static void do_test(int family, unsigned int report_opt) } + record_timestamp_usr(); + val = sendmsg(fd, &msg, 0); if (val != total_len) error(1, errno, "send"); + __print_timestamp(" USR", &ts_usr, 0, 0); + /* wait for all errors to be queued, else ACKs arrive OOO */ if (cfg_sleep_usec) usleep(cfg_sleep_usec); -- cgit v1.2.3 From fa5726692e4ca0d4e56d7cbd1b33126efd3f849e Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Mon, 12 Jan 2026 15:34:36 +0000 Subject: tools: ynl: render event op docs correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docs for YNL event ops currently render raw python structs. For example in: https://docs.kernel.org/netlink/specs/ethtool.html#cable-test-ntf event: {‘attributes’: [‘header’, ‘status’, ‘nest’], ‘__lineno__’: 2385} Handle event ops correctly and render their op attributes: event: attributes: [header, status] Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20260112153436.75495-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/doc_generator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/pyynl/lib/doc_generator.py b/tools/net/ynl/pyynl/lib/doc_generator.py index 3a16b8eb01ca..8b922d8f89e8 100644 --- a/tools/net/ynl/pyynl/lib/doc_generator.py +++ b/tools/net/ynl/pyynl/lib/doc_generator.py @@ -166,13 +166,13 @@ class YnlDocGenerator: continue lines.append(self.fmt.rst_paragraph(self.fmt.bold(key), level + 1)) if key in ['request', 'reply']: - lines.append(self.parse_do_attributes(do_dict[key], level + 1) + "\n") + lines.append(self.parse_op_attributes(do_dict[key], level + 1) + "\n") else: lines.append(self.fmt.headroom(level + 2) + do_dict[key] + "\n") return "\n".join(lines) - def parse_do_attributes(self, attrs: Dict[str, Any], level: int = 0) -> str: + def parse_op_attributes(self, attrs: Dict[str, Any], level: int = 0) -> str: """Parse 'attributes' section""" if "attributes" not in attrs: return "" @@ -184,7 +184,7 @@ class YnlDocGenerator: def parse_operations(self, operations: List[Dict[str, Any]], namespace: str) -> str: """Parse operations block""" - preprocessed = ["name", "doc", "title", "do", "dump", "flags"] + preprocessed = ["name", "doc", "title", "do", "dump", "flags", "event"] linkable = ["fixed-header", "attribute-set"] lines = [] @@ -217,6 +217,9 @@ class YnlDocGenerator: if "dump" in operation: lines.append(self.fmt.rst_paragraph(":dump:", 0)) lines.append(self.parse_do(operation["dump"], 0)) + if "event" in operation: + lines.append(self.fmt.rst_paragraph(":event:", 0)) + lines.append(self.parse_op_attributes(operation["event"], 0)) # New line after fields lines.append("\n") -- cgit v1.2.3 From 8d3b6649499edd85e88b763e77bbce2ab016eb47 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 8 Dec 2025 17:57:27 -0800 Subject: perf util: Add BLAKE2s support Add BLAKE2s support to the perf utility library. The code is borrowed from the kernel. This will replace the use of SHA-1 in genelf.c. Signed-off-by: Eric Biggers Tested-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Fangrui Song Cc: Ingo Molnar Cc: James Clark Cc: Jason A. Donenfeld Cc: Jiri Olsa Cc: Mark Rutland Cc: Pablo Galindo Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/util.c | 78 +++++++++++++++++++++++- tools/perf/util/Build | 1 + tools/perf/util/blake2s.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/blake2s.h | 73 ++++++++++++++++++++++ 4 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/blake2s.c create mode 100644 tools/perf/util/blake2s.h (limited to 'tools') diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c index b273d287e164..efc3e4e4c6fa 100644 --- a/tools/perf/tests/util.c +++ b/tools/perf/tests/util.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "tests.h" +#include "util/blake2s.h" #include "util/debug.h" #include "util/sha1.h" @@ -59,8 +60,79 @@ static int test_sha1(void) return 0; } +/* Maximum data length tested by test_blake2s() */ +#define MAX_DATA_LEN 512 + +/* + * Hash length tested by test_blake2s(). BLAKE2s supports variable-length + * hashes. However, the only user of BLAKE2s in 'perf' uses 20-byte hashes, + * matching the length of the ELF build ID field. So that's the length we test. + */ +#define HASH_LEN 20 + +/* Test the implementation of the BLAKE2s hash algorithm. */ +static int test_blake2s(void) +{ + u8 data[MAX_DATA_LEN]; + u8 hash[HASH_LEN]; + u8 hash2[HASH_LEN]; + struct blake2s_ctx main_ctx; + /* + * This value was generated by the following Python code: + * + * import hashlib + * + * data = bytes(i % 256 for i in range(513)) + * h = hashlib.blake2s(digest_size=20) + * for i in range(513): + * h.update(hashlib.blake2s(data=data[:i], digest_size=20).digest()) + * print(h.hexdigest()) + */ + static const u8 expected_hash_of_hashes[20] = { + 0xef, 0x9b, 0x13, 0x98, 0x78, 0x8e, 0x74, 0x59, 0x9c, 0xd5, + 0x0c, 0xf0, 0x33, 0x97, 0x79, 0x3d, 0x3e, 0xd0, 0x95, 0xa6 + }; + size_t i; + + /* Generate MAX_DATA_LEN bytes of data. */ + for (i = 0; i < MAX_DATA_LEN; i++) + data[i] = i; + + blake2s_init(&main_ctx, sizeof(hash)); + for (i = 0; i <= MAX_DATA_LEN; i++) { + struct blake2s_ctx ctx; + + /* Compute the BLAKE2s hash of 'i' data bytes. */ + blake2s_init(&ctx, HASH_LEN); + blake2s_update(&ctx, data, i); + blake2s_final(&ctx, hash); + + /* Verify that multiple updates produce the same result. */ + blake2s_init(&ctx, HASH_LEN); + blake2s_update(&ctx, data, i / 2); + blake2s_update(&ctx, &data[i / 2], i - (i / 2)); + blake2s_final(&ctx, hash2); + TEST_ASSERT_VAL("inconsistent BLAKE2s hashes", + memcmp(hash, hash2, HASH_LEN) == 0); + + /* + * Pass the hash to another BLAKE2s context, so that we + * incrementally compute the hash of all the hashes. + */ + blake2s_update(&main_ctx, hash, HASH_LEN); + } + + /* Verify the hash of all the hashes. */ + blake2s_final(&main_ctx, hash); + TEST_ASSERT_VAL("wrong BLAKE2s hashes", + memcmp(hash, expected_hash_of_hashes, HASH_LEN) == 0); + return 0; +} + static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_unused) { + int ret; + TEST_ASSERT_VAL("empty string", test_strreplace(' ', "", "123", "")); TEST_ASSERT_VAL("no match", test_strreplace('5', "123", "4", "123")); TEST_ASSERT_VAL("replace 1", test_strreplace('3', "123", "4", "124")); @@ -68,7 +140,11 @@ static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_u TEST_ASSERT_VAL("replace long", test_strreplace('a', "abcabc", "longlong", "longlongbclonglongbc")); - return test_sha1(); + ret = test_sha1(); + if (ret != TEST_OK) + return ret; + + return test_blake2s(); } DEFINE_SUITE("util", util); diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 2bed6274e248..0c1cfcbed815 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -5,6 +5,7 @@ perf-util-y += arm64-frame-pointer-unwind-support.o perf-util-y += addr2line.o perf-util-y += addr_location.o perf-util-y += annotate.o +perf-util-y += blake2s.o perf-util-y += block-info.o perf-util-y += block-range.o perf-util-y += build-id.o diff --git a/tools/perf/util/blake2s.c b/tools/perf/util/blake2s.c new file mode 100644 index 000000000000..ce5d89a19376 --- /dev/null +++ b/tools/perf/util/blake2s.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * This is an implementation of the BLAKE2s hash and PRF functions. + * + * Information: https://blake2.net/ + */ + +#include "blake2s.h" +#include + +static inline u32 ror32(u32 v, int n) +{ + return (v >> n) | (v << (32 - n)); +} + +static inline void le32_to_cpu_array(u32 a[], size_t n) +{ + for (size_t i = 0; i < n; i++) + a[i] = le32_to_cpu((__force __le32)a[i]); +} + +static inline void cpu_to_le32_array(u32 a[], size_t n) +{ + for (size_t i = 0; i < n; i++) + a[i] = (__force u32)cpu_to_le32(a[i]); +} + +static const u8 blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc) +{ + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); +} + +static void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + u32 m[16]; + u32 v[16]; + int i; + + while (nblocks > 0) { + blake2s_increment_counter(ctx, inc); + memcpy(m, data, BLAKE2S_BLOCK_SIZE); + le32_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, ctx->h, 32); + v[ 8] = BLAKE2S_IV0; + v[ 9] = BLAKE2S_IV1; + v[10] = BLAKE2S_IV2; + v[11] = BLAKE2S_IV3; + v[12] = BLAKE2S_IV4 ^ ctx->t[0]; + v[13] = BLAKE2S_IV5 ^ ctx->t[1]; + v[14] = BLAKE2S_IV6 ^ ctx->f[0]; + v[15] = BLAKE2S_IV7 ^ ctx->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2s_sigma[r][2 * i + 0]]; \ + d = ror32(d ^ a, 16); \ + c += d; \ + b = ror32(b ^ c, 12); \ + a += b + m[blake2s_sigma[r][2 * i + 1]]; \ + d = ror32(d ^ a, 8); \ + c += d; \ + b = ror32(b ^ c, 7); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + ctx->h[i] ^= v[i] ^ v[i + 8]; + + data += BLAKE2S_BLOCK_SIZE; + --nblocks; + } +} + +static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx) +{ + ctx->f[0] = -1; +} + +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen) +{ + const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen; + + if (unlikely(!inlen)) + return; + if (inlen > fill) { + memcpy(ctx->buf + ctx->buflen, in, fill); + blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE); + ctx->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + + blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; +} + +void blake2s_final(struct blake2s_ctx *ctx, u8 *out) +{ + blake2s_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, + BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */ + blake2s_compress(ctx, ctx->buf, 1, ctx->buflen); + cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h)); + memcpy(out, ctx->h, ctx->outlen); + memset(ctx, 0, sizeof(*ctx)); +} diff --git a/tools/perf/util/blake2s.h b/tools/perf/util/blake2s.h new file mode 100644 index 000000000000..a1fe81a4bea8 --- /dev/null +++ b/tools/perf/util/blake2s.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#ifndef _CRYPTO_BLAKE2S_H +#define _CRYPTO_BLAKE2S_H + +#include +#include + +#define BLAKE2S_BLOCK_SIZE 64 + +struct blake2s_ctx { + u32 h[8]; + u32 t[2]; + u32 f[2]; + u8 buf[BLAKE2S_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + +enum blake2s_iv { + BLAKE2S_IV0 = 0x6A09E667UL, + BLAKE2S_IV1 = 0xBB67AE85UL, + BLAKE2S_IV2 = 0x3C6EF372UL, + BLAKE2S_IV3 = 0xA54FF53AUL, + BLAKE2S_IV4 = 0x510E527FUL, + BLAKE2S_IV5 = 0x9B05688CUL, + BLAKE2S_IV6 = 0x1F83D9ABUL, + BLAKE2S_IV7 = 0x5BE0CD19UL, +}; + +static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2S_IV1; + ctx->h[2] = BLAKE2S_IV2; + ctx->h[3] = BLAKE2S_IV3; + ctx->h[4] = BLAKE2S_IV4; + ctx->h[5] = BLAKE2S_IV5; + ctx->h[6] = BLAKE2S_IV6; + ctx->h[7] = BLAKE2S_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; + if (keylen) { + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2S_BLOCK_SIZE; + } +} + +static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) +{ + __blake2s_init(ctx, outlen, NULL, 0); +} + +static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + __blake2s_init(ctx, outlen, key, keylen); +} + +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); + +void blake2s_final(struct blake2s_ctx *ctx, u8 *out); + +#endif /* _CRYPTO_BLAKE2S_H */ -- cgit v1.2.3 From f136fc491b2a48dbfcb98cac372303dc0e18f0c1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 8 Dec 2025 17:57:28 -0800 Subject: perf genelf: Switch from SHA-1 to BLAKE2s for build ID generation Recent patches [1] [2] added an implementation of SHA-1 to perf and made it be used for build ID generation. I had understood the choice of SHA-1, which is a legacy algorithm, to be for backwards compatibility. It turns out, though, that there's no backwards compatibility requirement here other than the size of the build ID field, which is fixed at 20 bytes. Not only did the hash algorithm already change (from MD5 to SHA-1), but the inputs to the hash changed too: from 'load_addr || code' to just 'code', and now again to 'code || symtab || strsym' [3]. Different linkers generate different build IDs, with the LLVM linker using BLAKE3 hashes for example [4]. Therefore, we might as well switch to a more modern algorithm. Let's go with BLAKE2s. It's faster than SHA-1, isn't cryptographically broken, is easier to implement than BLAKE3, and the kernel's implementation in lib/crypto/blake2s.c is easily borrowed. It also natively supports variable-length hashes, so it can directly produce the needed 20 bytes. Also make the following additional improvements: - Hash the three inputs incrementally, so they don't all have to be concatenated into one buffer. - Add tag/length prefixes to each of the three inputs, so that distinct input tuples reliably result in distinct hashes. [1] https://lore.kernel.org/linux-perf-users/20250521225307.743726-1-yuzhuo@google.com/ [2] https://lore.kernel.org/linux-perf-users/20250625202311.23244-1-ebiggers@kernel.org/ [3] https://lore.kernel.org/linux-perf-users/20251125080748.461014-1-namhyung@kernel.org/ [4] https://github.com/llvm/llvm-project/commit/d3e5b6f7539b86995aef6e2075c1edb3059385ce Signed-off-by: Eric Biggers Tested-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Fangrui Song Cc: Ingo Molnar Cc: James Clark Cc: Jason A. Donenfeld Cc: Jiri Olsa Cc: Mark Rutland Cc: Pablo Galindo Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/genelf.c | 58 +++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c index a1cd5196f4ec..14882def9704 100644 --- a/tools/perf/util/genelf.c +++ b/tools/perf/util/genelf.c @@ -18,8 +18,8 @@ #include #endif +#include "blake2s.h" #include "genelf.h" -#include "sha1.h" #include "../util/jitdump.h" #include @@ -51,7 +51,7 @@ static char shd_string_table[] = { static struct buildid_note { Elf_Note desc; /* descsz: size of build-id, must be multiple of 4 */ char name[4]; /* GNU\0 */ - u8 build_id[SHA1_DIGEST_SIZE]; + u8 build_id[20]; } bnote; static Elf_Sym symtab[]={ @@ -152,9 +152,28 @@ jit_add_eh_frame_info(Elf *e, void* unwinding, uint64_t unwinding_header_size, return 0; } +enum { + TAG_CODE = 0, + TAG_SYMTAB = 1, + TAG_STRSYM = 2, +}; + +/* + * Update the hash using the given data, also prepending a (tag, len) prefix to + * ensure that distinct input tuples reliably result in distinct hashes. + */ +static void blake2s_update_tagged(struct blake2s_ctx *ctx, int tag, + const void *data, size_t len) +{ + u64 prefix = ((u64)tag << 56) | len; + + blake2s_update(ctx, (const u8 *)&prefix, sizeof(prefix)); + blake2s_update(ctx, data, len); +} + /* * fd: file descriptor open for writing for the output file - * load_addr: code load address (could be zero, just used for buildid) + * load_addr: code load address (could be zero) * sym: function name (for native code - used as the symbol) * code: the native code * csize: the code size in bytes @@ -173,8 +192,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym, Elf_Shdr *shdr; uint64_t eh_frame_base_offset; char *strsym = NULL; - void *build_id_data = NULL, *tmp; - int build_id_data_len; + struct blake2s_ctx ctx; int symlen; int retval = -1; @@ -253,13 +271,8 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym, shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; shdr->sh_entsize = 0; - build_id_data = malloc(csize); - if (build_id_data == NULL) { - warnx("cannot allocate build-id data"); - goto error; - } - memcpy(build_id_data, code, csize); - build_id_data_len = csize; + blake2s_init(&ctx, sizeof(bnote.build_id)); + blake2s_update_tagged(&ctx, TAG_CODE, code, csize); /* * Setup .eh_frame_hdr and .eh_frame @@ -344,14 +357,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym, shdr->sh_entsize = sizeof(Elf_Sym); shdr->sh_link = unwinding ? 6 : 4; /* index of .strtab section */ - tmp = realloc(build_id_data, build_id_data_len + sizeof(symtab)); - if (tmp == NULL) { - warnx("cannot allocate build-id data"); - goto error; - } - memcpy(tmp + build_id_data_len, symtab, sizeof(symtab)); - build_id_data = tmp; - build_id_data_len += sizeof(symtab); + blake2s_update_tagged(&ctx, TAG_SYMTAB, symtab, sizeof(symtab)); /* * setup symbols string table @@ -395,14 +401,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym, shdr->sh_flags = 0; shdr->sh_entsize = 0; - tmp = realloc(build_id_data, build_id_data_len + symlen); - if (tmp == NULL) { - warnx("cannot allocate build-id data"); - goto error; - } - memcpy(tmp + build_id_data_len, strsym, symlen); - build_id_data = tmp; - build_id_data_len += symlen; + blake2s_update_tagged(&ctx, TAG_STRSYM, strsym, symlen); /* * setup build-id section @@ -422,7 +421,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym, /* * build-id generation */ - sha1(build_id_data, build_id_data_len, bnote.build_id); + blake2s_final(&ctx, bnote.build_id); bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */ bnote.desc.descsz = sizeof(bnote.build_id); bnote.desc.type = NT_GNU_BUILD_ID; @@ -467,7 +466,6 @@ error: (void)elf_end(e); free(strsym); - free(build_id_data); return retval; } -- cgit v1.2.3 From e35dd81017011be0fb0cbb2ae80a6bc24962f0f7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 8 Dec 2025 17:57:29 -0800 Subject: perf util: Remove SHA-1 code Now that the SHA-1 code is no longer used, remove it. Signed-off-by: Eric Biggers Tested-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Fangrui Song Cc: Ingo Molnar Cc: James Clark Cc: Jason A. Donenfeld Cc: Jiri Olsa Cc: Mark Rutland Cc: Pablo Galindo Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/util.c | 49 ------------------------- tools/perf/util/Build | 1 - tools/perf/util/sha1.c | 97 ------------------------------------------------- tools/perf/util/sha1.h | 6 --- 4 files changed, 153 deletions(-) delete mode 100644 tools/perf/util/sha1.c delete mode 100644 tools/perf/util/sha1.h (limited to 'tools') diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c index efc3e4e4c6fa..bf2c5b133884 100644 --- a/tools/perf/tests/util.c +++ b/tools/perf/tests/util.c @@ -2,7 +2,6 @@ #include "tests.h" #include "util/blake2s.h" #include "util/debug.h" -#include "util/sha1.h" #include #include @@ -18,48 +17,6 @@ static int test_strreplace(char needle, const char *haystack, return ret == 0; } -#define MAX_LEN 512 - -/* Test sha1() for all lengths from 0 to MAX_LEN inclusively. */ -static int test_sha1(void) -{ - u8 data[MAX_LEN]; - size_t digests_size = (MAX_LEN + 1) * SHA1_DIGEST_SIZE; - u8 *digests; - u8 digest_of_digests[SHA1_DIGEST_SIZE]; - /* - * The correctness of this value was verified by running this test with - * sha1() replaced by OpenSSL's SHA1(). - */ - static const u8 expected_digest_of_digests[SHA1_DIGEST_SIZE] = { - 0x74, 0xcd, 0x4c, 0xb9, 0xd8, 0xa6, 0xd5, 0x95, 0x22, 0x8b, - 0x7e, 0xd6, 0x8b, 0x7e, 0x46, 0x95, 0x31, 0x9b, 0xa2, 0x43, - }; - size_t i; - - digests = malloc(digests_size); - TEST_ASSERT_VAL("failed to allocate digests", digests != NULL); - - /* Generate MAX_LEN bytes of data. */ - for (i = 0; i < MAX_LEN; i++) - data[i] = i; - - /* Calculate a SHA-1 for each length 0 through MAX_LEN inclusively. */ - for (i = 0; i <= MAX_LEN; i++) - sha1(data, i, &digests[i * SHA1_DIGEST_SIZE]); - - /* Calculate digest of all digests calculated above. */ - sha1(digests, digests_size, digest_of_digests); - - free(digests); - - /* Check for the expected result. */ - TEST_ASSERT_VAL("wrong output from sha1()", - memcmp(digest_of_digests, expected_digest_of_digests, - SHA1_DIGEST_SIZE) == 0); - return 0; -} - /* Maximum data length tested by test_blake2s() */ #define MAX_DATA_LEN 512 @@ -131,8 +88,6 @@ static int test_blake2s(void) static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_unused) { - int ret; - TEST_ASSERT_VAL("empty string", test_strreplace(' ', "", "123", "")); TEST_ASSERT_VAL("no match", test_strreplace('5', "123", "4", "123")); TEST_ASSERT_VAL("replace 1", test_strreplace('3', "123", "4", "124")); @@ -140,10 +95,6 @@ static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_u TEST_ASSERT_VAL("replace long", test_strreplace('a', "abcabc", "longlong", "longlongbclonglongbc")); - ret = test_sha1(); - if (ret != TEST_OK) - return ret; - return test_blake2s(); } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0c1cfcbed815..248ad3ac64da 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -44,7 +44,6 @@ perf-util-y += rbtree.o perf-util-y += libstring.o perf-util-y += bitmap.o perf-util-y += hweight.o -perf-util-y += sha1.o perf-util-y += smt.o perf-util-y += strbuf.o perf-util-y += string.o diff --git a/tools/perf/util/sha1.c b/tools/perf/util/sha1.c deleted file mode 100644 index 7032fa4ff3fd..000000000000 --- a/tools/perf/util/sha1.c +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * SHA-1 message digest algorithm - * - * Copyright 2025 Google LLC - */ -#include -#include -#include -#include - -#include "sha1.h" - -#define SHA1_BLOCK_SIZE 64 - -static const u32 sha1_K[4] = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 }; - -#define SHA1_ROUND(i, a, b, c, d, e) \ - do { \ - if ((i) >= 16) \ - w[i] = rol32(w[(i) - 16] ^ w[(i) - 14] ^ w[(i) - 8] ^ \ - w[(i) - 3], \ - 1); \ - e += w[i] + rol32(a, 5) + sha1_K[(i) / 20]; \ - if ((i) < 20) \ - e += (b & (c ^ d)) ^ d; \ - else if ((i) < 40 || (i) >= 60) \ - e += b ^ c ^ d; \ - else \ - e += (c & d) ^ (b & (c ^ d)); \ - b = rol32(b, 30); \ - /* The new (a, b, c, d, e) is the old (e, a, b, c, d). */ \ - } while (0) - -#define SHA1_5ROUNDS(i) \ - do { \ - SHA1_ROUND((i) + 0, a, b, c, d, e); \ - SHA1_ROUND((i) + 1, e, a, b, c, d); \ - SHA1_ROUND((i) + 2, d, e, a, b, c); \ - SHA1_ROUND((i) + 3, c, d, e, a, b); \ - SHA1_ROUND((i) + 4, b, c, d, e, a); \ - } while (0) - -#define SHA1_20ROUNDS(i) \ - do { \ - SHA1_5ROUNDS((i) + 0); \ - SHA1_5ROUNDS((i) + 5); \ - SHA1_5ROUNDS((i) + 10); \ - SHA1_5ROUNDS((i) + 15); \ - } while (0) - -static void sha1_blocks(u32 h[5], const u8 *data, size_t nblocks) -{ - while (nblocks--) { - u32 a = h[0]; - u32 b = h[1]; - u32 c = h[2]; - u32 d = h[3]; - u32 e = h[4]; - u32 w[80]; - - for (int i = 0; i < 16; i++) - w[i] = get_unaligned_be32(&data[i * 4]); - SHA1_20ROUNDS(0); - SHA1_20ROUNDS(20); - SHA1_20ROUNDS(40); - SHA1_20ROUNDS(60); - - h[0] += a; - h[1] += b; - h[2] += c; - h[3] += d; - h[4] += e; - data += SHA1_BLOCK_SIZE; - } -} - -/* Calculate the SHA-1 message digest of the given data. */ -void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE]) -{ - u32 h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, - 0xC3D2E1F0 }; - u8 final_data[2 * SHA1_BLOCK_SIZE] = { 0 }; - size_t final_len = len % SHA1_BLOCK_SIZE; - - sha1_blocks(h, data, len / SHA1_BLOCK_SIZE); - - memcpy(final_data, data + len - final_len, final_len); - final_data[final_len] = 0x80; - final_len = round_up(final_len + 9, SHA1_BLOCK_SIZE); - put_unaligned_be64((u64)len * 8, &final_data[final_len - 8]); - - sha1_blocks(h, final_data, final_len / SHA1_BLOCK_SIZE); - - for (int i = 0; i < 5; i++) - put_unaligned_be32(h[i], &out[i * 4]); -} diff --git a/tools/perf/util/sha1.h b/tools/perf/util/sha1.h deleted file mode 100644 index e92c9966e1d5..000000000000 --- a/tools/perf/util/sha1.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#include - -#define SHA1_DIGEST_SIZE 20 - -void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE]); -- cgit v1.2.3 From 9f8f5edc79b6f22d0b4510d08b6a9c6e7f2c96e5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 17 Dec 2025 10:39:27 -0800 Subject: perf inject: Keep build-ID data if no option is used The keep_feat() determines which header features will be kept or discarded. Usually 'perf inject' will add build-IDs based on -b, -B or other related options. But it lose build-ID when none of those options are used. This is meaningful only when --buildid-mmap is not used. The following example shows the impact of this change. $ perf record --no-buildid-mmap true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.037 MB perf.data (5 samples) ] $ perf inject -i perf.data -o perf.data.inject $ perf buildid-list -i perf.data 08cccc2a9388d5247ccb3e864f3063b975b0a15d /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 fd5c4d5673256cd6bda51725dba048dabb0f854e [kernel.kallsyms] 97a36ce1140071be5c36b147fa0bed173e05a602 [vdso] $ perf buildid-list -i perf.data.inject 97a36ce1140071be5c36b147fa0bed173e05a602 [vdso] With this change, perf.data.inject would show the same list (of course, you need to run perf inject again). Reported-by: Gabriel Marin Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index aa7be4fb5838..6080afec537d 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2022,7 +2022,7 @@ static int save_section_info(struct perf_inject *inject) return perf_header__process_sections(header, fd, inject, save_section_info_cb); } -static bool keep_feat(int feat) +static bool keep_feat(struct perf_inject *inject, int feat) { switch (feat) { /* Keep original information that describes the machine or software */ @@ -2050,6 +2050,7 @@ static bool keep_feat(int feat) return true; /* Information that can be updated */ case HEADER_BUILD_ID: + return inject->build_id_style == BID_RWS__NONE; case HEADER_CMDLINE: case HEADER_EVENT_DESC: case HEADER_BRANCH_STACK: @@ -2108,7 +2109,7 @@ static int feat_copy_cb(struct feat_copier *fc, int feat, struct feat_writer *fw int ret; if (!inject->secs[feat].offset || - !keep_feat(feat)) + !keep_feat(inject, feat)) return 0; ret = feat_copy(inject, feat, fw); -- cgit v1.2.3 From b2629e7846e35dbf12de6d7b8e81f0049f6a50ea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Dec 2025 17:18:17 -0800 Subject: perf test: Skip dlfilter test for build failures For some reason, it may fail to build the dlfilter. Let's skip the test as it's not an error in the perf. This can happen when you run the perf test without source code or in a different directory. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/script_dlfilter.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/script_dlfilter.sh b/tools/perf/tests/shell/script_dlfilter.sh index 45c97d4a7d5f..7895ab0309b2 100755 --- a/tools/perf/tests/shell/script_dlfilter.sh +++ b/tools/perf/tests/shell/script_dlfilter.sh @@ -70,15 +70,15 @@ test_dlfilter() { # Build the dlfilter if ! cc -c -I tools/perf/include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o" then - echo "Basic --dlfilter test [Failed to build dlfilter object]" - err=1 + echo "Basic --dlfilter test [Skip - failed to build dlfilter object]" + err=2 return fi if ! cc -shared -o "${dlfilter_so}" "${dlfilter_so}.o" then - echo "Basic --dlfilter test [Failed to link dlfilter shared object]" - err=1 + echo "Basic --dlfilter test [Skip - failed to link dlfilter shared object]" + err=2 return fi -- cgit v1.2.3 From f552878a720bf765cc1616ee4a4e243cc03e4b27 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Dec 2025 17:18:18 -0800 Subject: perf test: Use shelldir to refer perf source location It uses tools/perf/include which assumes it's running from the root of the linux kernel source tree. But you can run perf from other places like tools/perf, then the include path won't match. We can use the shelldir variable to locate the test script in the tree. $ cd tools/perf $ ./perf test dlfilter 63: dlfilter C API : Ok 101: perf script --dlfilter tests : Ok Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/script_dlfilter.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/script_dlfilter.sh b/tools/perf/tests/shell/script_dlfilter.sh index 7895ab0309b2..aaed92bb7828 100755 --- a/tools/perf/tests/shell/script_dlfilter.sh +++ b/tools/perf/tests/shell/script_dlfilter.sh @@ -68,7 +68,7 @@ test_dlfilter() { fi # Build the dlfilter - if ! cc -c -I tools/perf/include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o" + if ! cc -c -I ${shelldir}/../../include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o" then echo "Basic --dlfilter test [Skip - failed to build dlfilter object]" err=2 -- cgit v1.2.3 From 1c89bc1b95fa9058f3e7cd37f1142939261417d5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Dec 2025 17:18:19 -0800 Subject: perf test: Do not skip when some metrics tests succeeded I think the return value of SKIP (2) should be used when it skipped the entire test suite rather than a few of them. While the FAIL should be reserved if any of test failed. $ perf test -vv 110 110: perf all metrics test: --- start --- test child forked, pid 2496399 Testing tma_core_bound Testing tma_info_core_ilp Testing tma_info_memory_l2mpki Testing tma_memory_bound Testing tma_bottleneck_irregular_overhead Testing tma_bottleneck_mispredictions Testing tma_info_bad_spec_branch_misprediction_cost Testing tma_info_bad_spec_ipmisp_cond_ntaken Testing tma_info_bad_spec_ipmisp_cond_taken Testing tma_info_bad_spec_ipmisp_indirect Testing tma_info_bad_spec_ipmisp_ret Testing tma_info_bad_spec_ipmispredict Testing tma_info_branches_callret Testing tma_info_branches_cond_nt Testing tma_info_branches_cond_tk Testing tma_info_branches_jump Testing tma_info_branches_other_branches Testing tma_branch_mispredicts Testing tma_clears_resteers Testing tma_machine_clears Testing tma_mispredicts_resteers Testing tma_bottleneck_big_code Testing tma_icache_misses Testing tma_itlb_misses Testing tma_unknown_branches Testing tma_info_bad_spec_spec_clears_ratio Testing tma_other_mispredicts Testing tma_branch_instructions Testing tma_info_frontend_tbpc Testing tma_info_inst_mix_bptkbranch Testing tma_info_inst_mix_ipbranch Testing tma_info_inst_mix_ipcall Testing tma_info_inst_mix_iptb Testing tma_info_system_ipfarbranch Testing tma_info_thread_uptb Testing tma_bottleneck_branching_overhead Testing tma_nop_instructions Testing tma_bottleneck_compute_bound_est Testing tma_divider Testing tma_ports_utilized_3m Testing tma_bottleneck_instruction_fetch_bw Testing tma_frontend_bound Testing tma_assists Testing tma_other_nukes Testing tma_serializing_operation Testing tma_bottleneck_data_cache_memory_bandwidth Testing tma_fb_full Testing tma_mem_bandwidth Testing tma_sq_full Testing tma_bottleneck_data_cache_memory_latency Testing tma_l1_latency_dependency Testing tma_l2_bound Testing tma_l3_hit_latency Testing tma_mem_latency Testing tma_store_latency Testing tma_bottleneck_memory_synchronization Testing tma_contested_accesses Testing tma_data_sharing Testing tma_false_sharing Testing tma_bottleneck_memory_data_tlbs Testing tma_dtlb_load Testing tma_dtlb_store Testing tma_backend_bound Testing tma_bottleneck_other_bottlenecks Testing tma_bottleneck_useful_work Testing tma_retiring Testing tma_info_memory_fb_hpki Testing tma_info_memory_l1mpki Testing tma_info_memory_l1mpki_load Testing tma_info_memory_l2hpki_all Testing tma_info_memory_l2hpki_load Testing tma_info_memory_l2mpki_all Testing tma_info_memory_l2mpki_load Testing tma_l1_bound Testing tma_l3_bound Testing tma_info_memory_l2mpki_rfo Testing tma_fp_scalar Testing tma_fp_vector Testing tma_fp_vector_128b Testing tma_fp_vector_256b Testing tma_fp_vector_512b Testing tma_port_0 Testing tma_x87_use Testing tma_info_botlnk_l0_core_bound_likely Testing tma_info_core_fp_arith_utilization Testing tma_info_pipeline_execute Testing tma_info_system_gflops Testing tma_info_thread_execute_per_issue Testing tma_dsb Testing tma_info_botlnk_l2_dsb_bandwidth Testing tma_info_frontend_dsb_coverage Testing tma_decoder0_alone Testing tma_dsb_switches Testing tma_info_botlnk_l2_dsb_misses Testing tma_info_frontend_dsb_switch_cost Testing tma_info_frontend_ipdsb_miss_ret Testing tma_mite Testing tma_mite_4wide Testing CPUs_utilized Testing backend_cycles_idle [Ignored backend_cycles_idle] failed but as a Default metric this can be expected Performance counter stats for 'perf test -w noploop': cpu-cycles:u stalled-cycles-backend:u 1.014051473 seconds time elapsed 1.005718000 seconds user 0.008013000 seconds sys Testing branch_frequency Testing branch_miss_rate Testing cs_per_second Testing cycles_frequency Testing frontend_cycles_idle [Ignored frontend_cycles_idle] failed but as a Default metric this can be expected Performance counter stats for 'perf test -w noploop': cpu-cycles:u stalled-cycles-frontend:u 1.012813656 seconds time elapsed 1.004603000 seconds user 0.008004000 seconds sys Testing insn_per_cycle Testing migrations_per_second Testing page_faults_per_second Testing stalled_cycles_per_instruction [Ignored stalled_cycles_per_instruction] failed but as a Default metric this can be expected Error: No supported events found. The stalled-cycles-backend:u event is not supported. Testing tma_bad_speculation Testing l1d_miss_rate Testing llc_miss_rate Testing dtlb_miss_rate Testing itlb_miss_rate [Ignored itlb_miss_rate] failed but as a Default metric this can be expected Performance counter stats for 'perf test -w noploop': iTLB-loads:u 3,097 iTLB-load-misses:u 1.012766732 seconds time elapsed 1.004318000 seconds user 0.008002000 seconds sys Testing l1i_miss_rate [Ignored l1i_miss_rate] failed but as a Default metric this can be expected Performance counter stats for 'perf test -w noploop': L1-icache-load-misses:u L1-icache-loads:u 1.013606395 seconds time elapsed 1.001371000 seconds user 0.011968000 seconds sys Testing l1_prefetch_miss_rate [Ignored l1_prefetch_miss_rate] failed but as a Default metric this can be expected Error: No supported events found. The L1-dcache-prefetches:u event is not supported. Testing tma_info_botlnk_l2_ic_misses Testing tma_info_frontend_fetch_upc Testing tma_info_frontend_icache_miss_latency Testing tma_info_frontend_ipunknown_branch Testing tma_info_frontend_lsd_coverage Testing tma_info_memory_tlb_code_stlb_mpki Testing tma_info_pipeline_fetch_dsb Testing tma_info_pipeline_fetch_lsd Testing tma_info_pipeline_fetch_mite Testing tma_info_pipeline_fetch_ms Testing tma_fetch_bandwidth Testing tma_lsd Testing tma_branch_resteers Testing tma_code_l2_hit Testing tma_code_l2_miss Testing tma_code_stlb_hit Testing tma_code_stlb_miss Testing tma_code_stlb_miss_2m Testing tma_code_stlb_miss_4k Testing tma_lcp Testing tma_ms_switches Testing tma_info_core_flopc Testing tma_info_inst_mix_iparith Testing tma_info_inst_mix_iparith_avx128 Testing tma_info_inst_mix_iparith_avx256 Testing tma_info_inst_mix_iparith_avx512 Testing tma_info_inst_mix_iparith_scalar_dp Testing tma_info_inst_mix_iparith_scalar_sp Testing tma_info_inst_mix_ipflop Testing tma_info_inst_mix_ippause Testing tma_fetch_latency Testing tma_fp_arith Testing tma_fp_assists Testing tma_info_system_cpu_utilization Testing tma_info_system_dram_bw_use [Skipped tma_info_system_dram_bw_use] Not supported events Performance counter stats for 'perf test -w noploop': UNC_ARB_TRK_REQUESTS.ALL:u UNC_ARB_COH_TRK_REQUESTS.ALL:u 1,013,554,749 duration_time 1.013527265 seconds time elapsed 1.005417000 seconds user 0.008011000 seconds sys Testing tma_info_frontend_l2mpki_code Testing tma_info_frontend_l2mpki_code_all Testing tma_info_inst_mix_ipload Testing tma_info_inst_mix_ipstore Testing tma_info_memory_latency_load_l2_miss_latency Testing tma_lock_latency Testing tma_info_memory_core_l1d_cache_fill_bw_2t Testing tma_info_memory_core_l2_cache_fill_bw_2t Testing tma_info_memory_core_l3_cache_access_bw_2t Testing tma_info_memory_core_l3_cache_fill_bw_2t Testing tma_info_memory_l1d_cache_fill_bw Testing tma_info_memory_l2_cache_fill_bw Testing tma_info_memory_l3_cache_access_bw Testing tma_info_memory_l3_cache_fill_bw Testing tma_info_memory_l3mpki Testing tma_info_memory_load_miss_real_latency Testing tma_info_memory_mix_bus_lock_pki Testing tma_info_memory_mix_uc_load_pki Testing tma_info_memory_mlp Testing tma_info_memory_tlb_load_stlb_mpki Testing tma_info_memory_tlb_page_walks_utilization Testing tma_info_memory_tlb_store_stlb_mpki Testing tma_info_system_mem_parallel_reads [Skipped tma_info_system_mem_parallel_reads] Not supported events Performance counter stats for 'perf test -w noploop': UNC_ARB_DAT_OCCUPANCY.RD:u UNC_ARB_DAT_OCCUPANCY.RD/cmask=1/ 1.013354884 seconds time elapsed 1.009239000 seconds user 0.004004000 seconds sys Testing tma_info_system_mem_read_latency [Skipped tma_info_system_mem_read_latency] Not supported events Performance counter stats for 'perf test -w noploop': UNC_ARB_DAT_OCCUPANCY.RD:u UNC_ARB_TRK_OCCUPANCY.RD UNC_ARB_TRK_REQUESTS.RD 1.012882143 seconds time elapsed 1.004600000 seconds user 0.008036000 seconds sys Testing tma_info_thread_cpi Testing tma_streaming_stores Testing tma_dram_bound Testing tma_store_bound Testing tma_l2_hit_latency Testing tma_load_stlb_hit Testing tma_load_stlb_miss Testing tma_load_stlb_miss_1g Testing tma_load_stlb_miss_2m Testing tma_load_stlb_miss_4k Testing tma_store_stlb_hit Testing tma_store_stlb_miss Testing tma_store_stlb_miss_1g Testing tma_store_stlb_miss_2m Testing tma_store_stlb_miss_4k Testing tma_info_memory_latency_data_l2_mlp Testing tma_info_memory_latency_load_l2_mlp Testing tma_info_pipeline_ipassist Testing tma_microcode_sequencer Testing tma_ms Testing tma_info_system_kernel_cpi [Failed tma_info_system_kernel_cpi] Metric contains missing events Error: No supported events found. Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. More information can be found at 'Perf events and tool security' document: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html perf_event_paranoid setting is 2: -1: Allow use of (almost) all events by all users Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = ) Testing tma_info_system_kernel_utilization [Failed tma_info_system_kernel_utilization] Metric contains missing events Error: No supported events found. Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. More information can be found at 'Perf events and tool security' document: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html perf_event_paranoid setting is 2: -1: Allow use of (almost) all events by all users Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = ) Testing tma_info_pipeline_retire Testing tma_info_thread_clks Testing tma_info_thread_uoppi Testing tma_memory_operations Testing tma_other_light_ops Testing tma_ports_utilization Testing tma_ports_utilized_0 Testing tma_ports_utilized_1 Testing tma_ports_utilized_2 Testing C10_Pkg_Residency [Failed C10_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c10-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c10-residency/u) in per-thread mode, enable system wide with '-a'. Testing C2_Pkg_Residency [Failed C2_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c2-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c2-residency/u) in per-thread mode, enable system wide with '-a'. Testing C3_Pkg_Residency [Failed C3_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { msr/tsc/, cstate_pkg/c3-residency/ } Error: No supported events found. Invalid event (msr/tsc/u) in per-thread mode, enable system wide with '-a'. Testing C6_Core_Residency [Failed C6_Core_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_core/c6-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_core/c6-residency/u) in per-thread mode, enable system wide with '-a'. Testing C6_Pkg_Residency [Failed C6_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c6-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c6-residency/u) in per-thread mode, enable system wide with '-a'. Testing C7_Core_Residency [Failed C7_Core_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_core/c7-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_core/c7-residency/u) in per-thread mode, enable system wide with '-a'. Testing C7_Pkg_Residency [Failed C7_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c7-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c7-residency/u) in per-thread mode, enable system wide with '-a'. Testing C8_Pkg_Residency [Failed C8_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c8-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c8-residency/u) in per-thread mode, enable system wide with '-a'. Testing C9_Pkg_Residency [Failed C9_Pkg_Residency] Metric contains missing events WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c9-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c9-residency/u) in per-thread mode, enable system wide with '-a'. Testing tma_info_core_epc Testing tma_info_system_core_frequency Testing tma_info_system_power [Skipped tma_info_system_power] Not supported events Performance counter stats for 'perf test -w noploop': Joules power/energy-pkg/u 1,013,238,256 duration_time 1.013223072 seconds time elapsed 0.995924000 seconds user 0.011903000 seconds sys Testing tma_info_system_power_license0_utilization Testing tma_info_system_power_license1_utilization Testing tma_info_system_power_license2_utilization Testing tma_info_system_turbo_utilization Testing tma_info_inst_mix_ipswpf Testing tma_info_memory_prefetches_useless_hwpf Testing tma_info_core_coreipc Testing tma_info_thread_ipc Testing tma_heavy_operations Testing tma_light_operations Testing tma_info_core_core_clks Testing tma_info_system_smt_2t_utilization Testing tma_info_thread_slots_utilization Testing UNCORE_FREQ [Skipped UNCORE_FREQ] Not supported events Performance counter stats for 'perf test -w noploop': UNC_CLOCK.SOCKET:u 1,015,993,466 duration_time 1.015949387 seconds time elapsed 1.007676000 seconds user 0.008029000 seconds sys Testing tma_info_system_socket_clks [Failed tma_info_system_socket_clks] Metric contains missing events Error: No supported events found. Invalid event (UNC_CLOCK.SOCKET:u) in per-thread mode, enable system wide with '-a'. Testing tma_info_inst_mix_instructions Testing tma_info_system_cpus_utilized Testing tma_info_system_mux Testing tma_info_system_time Testing tma_info_thread_slots Testing tma_few_uops_instructions Testing tma_4k_aliasing Testing tma_cisc Testing tma_fp_divider Testing tma_int_divider Testing tma_slow_pause Testing tma_split_loads Testing tma_split_stores Testing tma_store_fwd_blk Testing tma_alu_op_utilization Testing tma_load_op_utilization Testing tma_mixing_vectors Testing tma_store_op_utilization Testing tma_port_1 Testing tma_port_5 Testing tma_port_6 Testing smi_cycles [Skipped smi_cycles] Not supported events Performance counter stats for 'perf test -w noploop': msr/smi/u msr/aperf/u 3,965,789,327 cycles:u 1.012779591 seconds time elapsed 1.004579000 seconds user 0.007972000 seconds sys Testing smi_num [Failed smi_num] Metric contains missing events Error: No supported events found. Invalid event (msr/smi/u) in per-thread mode, enable system wide with '-a'. Testing tsx_aborted_cycles Testing tsx_cycles_per_elision Testing tsx_cycles_per_transaction Testing tsx_transactional_cycles ---- end(-1) ---- 110: perf all metrics test : FAILED! Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_all_metrics.sh | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh index 3dabb39c7cc8..b582d23f28c9 100755 --- a/tools/perf/tests/shell/stat_all_metrics.sh +++ b/tools/perf/tests/shell/stat_all_metrics.sh @@ -15,7 +15,8 @@ then test_prog="perf test -w noploop" fi -err=0 +skip=0 +err=3 for m in $(perf list --raw-dump metrics); do echo "Testing $m" result=$(perf stat -M "$m" $system_wide_flag -- $test_prog 2>&1) @@ -23,6 +24,10 @@ for m in $(perf list --raw-dump metrics); do if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]] then # No error result and metric shown. + if [[ "$err" -ne 1 ]] + then + err=0 + fi continue fi if [[ "$result" =~ "Cannot resolve IDs for" || "$result" =~ "No supported events found" ]] @@ -44,7 +49,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue elif [[ "$result" =~ "in per-thread mode, enable system wide" ]] @@ -53,7 +58,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue elif [[ "$result" =~ "" ]] @@ -68,7 +73,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue elif [[ "$result" =~ "" ]] @@ -77,7 +82,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue elif [[ "$result" =~ "FP_ARITH" || "$result" =~ "AMX" ]] @@ -86,7 +91,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue elif [[ "$result" =~ "PMM" ]] @@ -95,7 +100,7 @@ for m in $(perf list --raw-dump metrics); do echo $result if [[ $err -eq 0 ]] then - err=2 # Skip + skip=1 fi continue fi @@ -106,6 +111,10 @@ for m in $(perf list --raw-dump metrics); do if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]] then # No error result and metric shown. + if [[ "$err" -ne 1 ]] + then + err=0 + fi continue fi echo "[Failed $m] has non-zero error '$result_err' or not printed in:" @@ -113,4 +122,10 @@ for m in $(perf list --raw-dump metrics); do err=1 done +# return SKIP only if no success returned +if [[ "$err" -eq 3 && "$skip" -eq 1 ]] +then + err=2 +fi + exit "$err" -- cgit v1.2.3 From 84010f9bcf5389717b4ad02b6f2124ff59413bdf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Dec 2025 17:18:20 -0800 Subject: perf test: Do not skip when some metric-group tests succeed I think the return value of SKIP (2) should be used when it skipped the entire test suite rather than a few of them. While the FAIL should be reserved if any of test failed. $ perf test -vv 109 109: perf all metricgroups test: --- start --- test child forked, pid 2493003 Testing Backend Testing Bad Testing BadSpec Testing BigFootprint Testing BrMispredicts Testing Branches Testing BvBC Testing BvBO Testing BvCB Testing BvFB Testing BvIO Testing BvMB Testing BvML Testing BvMP Testing BvMS Testing BvMT Testing BvOB Testing BvUW Testing CacheHits Testing CacheMisses Testing CodeGen Testing Compute Testing Cor Testing DSB Testing DSBmiss Testing DataSharing Testing Default Testing Default2 Testing Default3 Testing Default4 Ignoring failures in Default4 that may contain unsupported legacy events Testing Fed Testing FetchBW Testing FetchLat Testing Flops Testing FpScalar Testing FpVector Testing Frontend Testing HPC Testing IcMiss Testing InsType Testing LSD Testing LockCont Testing MachineClears Testing Machine_Clears Testing Mem Testing MemOffcore Testing MemoryBW Testing MemoryBound Testing MemoryLat Testing MemoryTLB Testing Memory_BW Testing Memory_Lat Testing MicroSeq Testing OS Testing Offcore Testing PGO Testing Pipeline Testing PortsUtil Testing Power Testing Prefetches Testing Ret Testing Retire Testing SMT Testing Snoop Testing SoC Testing Summary Testing TmaL1 Testing TmaL2 Testing TmaL3mem Testing TopdownL1 Testing TopdownL2 Testing TopdownL3 Testing TopdownL4 Testing TopdownL5 Testing TopdownL6 Testing smi Testing tma_L1_group Testing tma_L2_group Testing tma_L3_group Testing tma_L4_group Testing tma_L5_group Testing tma_L6_group Testing tma_alu_op_utilization_group Testing tma_assists_group Testing tma_backend_bound_group Testing tma_bad_speculation_group Testing tma_branch_mispredicts_group Testing tma_branch_resteers_group Testing tma_code_stlb_miss_group Testing tma_core_bound_group Testing tma_divider_group Testing tma_dram_bound_group Testing tma_dtlb_load_group Testing tma_dtlb_store_group Testing tma_fetch_bandwidth_group Testing tma_fetch_latency_group Testing tma_fp_arith_group Testing tma_fp_vector_group Testing tma_frontend_bound_group Testing tma_heavy_operations_group Testing tma_icache_misses_group Testing tma_issue2P Testing tma_issueBM Testing tma_issueBW Testing tma_issueComp Testing tma_issueD0 Testing tma_issueFB Testing tma_issueFL Testing tma_issueL1 Testing tma_issueLat Testing tma_issueMC Testing tma_issueMS Testing tma_issueMV Testing tma_issueRFO Testing tma_issueSL Testing tma_issueSO Testing tma_issueSmSt Testing tma_issueSpSt Testing tma_issueSyncxn Testing tma_issueTLB Testing tma_itlb_misses_group Testing tma_l1_bound_group Testing tma_l2_bound_group Testing tma_l3_bound_group Testing tma_light_operations_group Testing tma_load_stlb_miss_group Testing tma_machine_clears_group Testing tma_memory_bound_group Testing tma_microcode_sequencer_group Testing tma_mite_group Testing tma_other_light_ops_group Testing tma_ports_utilization_group Testing tma_ports_utilized_0_group Testing tma_ports_utilized_3m_group Testing tma_retiring_group Testing tma_serializing_operation_group Testing tma_store_bound_group Testing tma_store_stlb_miss_group Testing transaction ---- end(0) ---- 109: perf all metricgroups test : Ok Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_all_metricgroups.sh | 26 +++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh index 1400880ec01f..81bc7070b5ab 100755 --- a/tools/perf/tests/shell/stat_all_metricgroups.sh +++ b/tools/perf/tests/shell/stat_all_metricgroups.sh @@ -12,31 +12,32 @@ if ParanoidAndNotRoot 0 then system_wide_flag="" fi -err=0 + +err=3 +skip=0 for m in $(perf list --raw-dump metricgroups) do echo "Testing $m" result=$(perf stat -M "$m" $system_wide_flag sleep 0.01 2>&1) result_err=$? - if [[ $result_err -gt 0 ]] + if [[ $result_err -eq 0 ]] then + if [[ "$err" -ne 1 ]] + then + err=0 + fi + else if [[ "$result" =~ \ "Access to performance monitoring and observability operations is limited" ]] then echo "Permission failure" echo $result - if [[ $err -eq 0 ]] - then - err=2 # Skip - fi + skip=1 elif [[ "$result" =~ "in per-thread mode, enable system wide" ]] then echo "Permissions - need system wide mode" echo $result - if [[ $err -eq 0 ]] - then - err=2 # Skip - fi + skip=1 elif [[ "$m" == @(Default2|Default3|Default4) ]] then echo "Ignoring failures in $m that may contain unsupported legacy events" @@ -48,4 +49,9 @@ do fi done +if [[ "$err" -eq 3 && "$skip" -eq 1 ]] +then + err=2 +fi + exit $err -- cgit v1.2.3 From d1f9dc67238e716a4cc0ffd7014f501775d5f3ed Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 16 Dec 2025 09:39:49 +0800 Subject: perf Documentation: Correct branch stack sampling call-stack option The correct call-stack option for branch stack sampling should be "stack" instead of "call_stack". Correct it. $perf record -e instructions -j call_stack -- sleep 1 unknown branch filter call_stack, check man page Usage: perf record [] [] or: perf record [] -- [] -j, --branch-filter branch stack filter modes Fixes: 955f6def5590ce6c ("perf record: Add remaining branch filters: "no_cycles", "no_flags" & "hw_index"") Reviewed-by: Namhyung Kim Signed-off-by: Dapeng Mi Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Xudong Hao Cc: Zide Chen Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index c402e74172f6..178f483140ed 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -455,7 +455,7 @@ following filters are defined: - no_tx: only when the target is not in a hardware transaction - abort_tx: only when the target is a hardware transaction abort - cond: conditional branches - - call_stack: save call stack + - stack: save call stack - no_flags: don't save branch flags e.g prediction, misprediction etc - no_cycles: don't save branch cycles - hw_index: save branch hardware index -- cgit v1.2.3 From a66f6242fbf521f8371d6cda5eaee6dc7668683b Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 31 Dec 2025 12:12:28 +0000 Subject: perf vendor events arm64: Remove uncountable events These events are never countable by the PMU and are only intended to be used as external inputs to trace. Therefore showing them in 'perf list' is misleading so remove them. The generator script doesn't emit these events when used with the new telemetry-solution input files [1]. 'perf list' should only show countable events because there are events that are sometimes implemented, sometimes countable and sometimes not, for example TRB_TRIG. If we always include any implemented events whether they are countable or not then it's not possible to tell whether they are usable in perf without going to the docs, defeating the point of 'perf list'. It's also not useful yet to display implemented events that are not countable (for help in using trace rather than perf stat), because PMU_OVFS and PMU_HOVFS are practically always implemented and TRB_TRIG is always implemented when there is TRBE. [1]: https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/tree/main/data/pmu/cpu Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Akio Kakuno Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: Yoshihiro Furudera Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json | 8 -------- tools/perf/pmu-events/arch/arm64/common-and-microarch.json | 12 ------------ tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json | 10 ---------- 3 files changed, 30 deletions(-) delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json delete mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json deleted file mode 100644 index d8b7b9f9e5fa..000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "ArchStdEvent": "PMU_OVFS" - }, - { - "ArchStdEvent": "PMU_HOVFS" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json index 2416d9f8a83d..468cb085d879 100644 --- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json +++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json @@ -432,24 +432,12 @@ "EventName": "TRB_WRAP", "BriefDescription": "Trace buffer current write pointer wrapped" }, - { - "PublicDescription": "PMU overflow, counters accessible to EL1 and EL0", - "EventCode": "0x400D", - "EventName": "PMU_OVFS", - "BriefDescription": "PMU overflow, counters accessible to EL1 and EL0" - }, { "PublicDescription": "Trace buffer Trigger Event", "EventCode": "0x400E", "EventName": "TRB_TRIG", "BriefDescription": "Trace buffer Trigger Event" }, - { - "PublicDescription": "PMU overflow, counters reserved for use by EL2", - "EventCode": "0x400F", - "EventName": "PMU_HOVFS", - "BriefDescription": "PMU overflow, counters reserved for use by EL2" - }, { "PublicDescription": "PE Trace Unit external output 0", "EventCode": "0x4010", diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json b/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json deleted file mode 100644 index 65bd6cdd0dd5..000000000000 --- a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "ArchStdEvent": "PMU_OVFS", - "BriefDescription": "This event counts the event generated each time one of the condition occurs described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit." - }, - { - "ArchStdEvent": "PMU_HOVFS", - "BriefDescription": "This event counts the event generated each time an event is counted by an event counter and all of the condition occur described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit." - } -] -- cgit v1.2.3 From 6e052cfe47c7fea0ac7cae271c69c69f0db3ca0e Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 2 Jan 2026 12:15:43 +0000 Subject: perf tools: Dump callchain context marker names These are hard to interpret in the raw output because they are printed as hex but are defined in perf_event.h as decimal. Make it much easier to read the raw callchains by just printing their names. For example: $ perf report -D 1798195372321 0x4638 [0xb0]: PERF_RECORD_SAMPLE(IP, 0x4002): 44922/44922: 0x7c8046dd3400 period: 120218 addr: 0 ... FP chain: nr:12 ..... 0: fffffffffffffe00 (PERF_CONTEXT_USER) ..... 1: 00007c8046dd3400 ..... 2: 00007c8046db86d3 Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra [ Add PERF_CONTEXT_USER_DEFERRED too, as per Namhyung's review comment ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4236503c8f6c..65fa9bdff1b8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -841,6 +841,28 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) } } +static const char *callchain_context_str(u64 ip) +{ + switch (ip) { + case PERF_CONTEXT_HV: + return " (PERF_CONTEXT_HV)"; + case PERF_CONTEXT_KERNEL: + return " (PERF_CONTEXT_KERNEL)"; + case PERF_CONTEXT_USER: + return " (PERF_CONTEXT_USER)"; + case PERF_CONTEXT_GUEST: + return " (PERF_CONTEXT_GUEST)"; + case PERF_CONTEXT_GUEST_KERNEL: + return " (PERF_CONTEXT_GUEST_KERNEL)"; + case PERF_CONTEXT_GUEST_USER: + return " (PERF_CONTEXT_GUEST_USER)"; + case PERF_CONTEXT_USER_DEFERRED: + return " (PERF_CONTEXT_USER_DEFERRED)"; + default: + return ""; + } +} + static void callchain__printf(struct evsel *evsel, struct perf_sample *sample) { @@ -853,8 +875,9 @@ static void callchain__printf(struct evsel *evsel, printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); for (i = 0; i < callchain->nr; i++) - printf("..... %2d: %016" PRIx64 "\n", - i, callchain->ips[i]); + printf("..... %2d: %016" PRIx64 "%s\n", + i, callchain->ips[i], + callchain_context_str(callchain->ips[i])); if (sample->deferred_callchain) printf("...... (deferred)\n"); -- cgit v1.2.3 From c55741148294700115ecacd19cb9c173721c1b6a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 30 Dec 2025 17:52:13 +0100 Subject: perf addr_location: Update outdated comment The function addr_location__put() was renamed addr_location__exit() in commit 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions"). Make the comment preceding the function consistent with the function itself. Reviewed-by: Ian Rogers Signed-off-by: Julia Lawall Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kexin Sun Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ratnadira Widyasari Cc: Xutong Ma Cc: Yumbo Lyu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/addr_location.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c index 007a2f5df9a6..81a0b79c5e10 100644 --- a/tools/perf/util/addr_location.c +++ b/tools/perf/util/addr_location.c @@ -24,7 +24,7 @@ void addr_location__init(struct addr_location *al) * The preprocess_sample method will return with reference counts for the * in it, when done using (and perhaps getting ref counts if needing to * keep a pointer to one of those entries) it must be paired with - * addr_location__put(), so that the refcounts can be decremented. + * addr_location__exit(), so that the refcounts can be decremented. */ void addr_location__exit(struct addr_location *al) { -- cgit v1.2.3 From 6fbf129c49905e9e34801b362c9c30ae383d7a90 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:53 +0800 Subject: libbpf: Add BTF permutation support for type reordering Introduce btf__permute() API to allow in-place rearrangement of BTF types. This function reorganizes BTF type order according to a provided array of type IDs, updating all type references to maintain consistency. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-2-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 42 +++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 176 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b136572e889a..bf75f770d29a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -5887,3 +5887,136 @@ int btf__relocate(struct btf *btf, const struct btf *base_btf) btf->owns_base = false; return libbpf_err(err); } + +struct btf_permute { + struct btf *btf; + __u32 *id_map; + __u32 start_offs; +}; + +/* Callback function to remap individual type ID references */ +static int btf_permute_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_permute *p = ctx; + __u32 new_id = *type_id; + + /* refer to the base BTF or VOID type */ + if (new_id < p->btf->start_id) + return 0; + + if (new_id >= btf__type_cnt(p->btf)) + return -EINVAL; + + *type_id = p->id_map[new_id - p->btf->start_id + p->start_offs]; + return 0; +} + +int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts) +{ + struct btf_permute p; + struct btf_ext *btf_ext; + void *nt, *new_types = NULL; + __u32 *order_map = NULL; + int err = 0, i; + __u32 n, id, start_offs = 0; + + if (!OPTS_VALID(opts, btf_permute_opts)) + return libbpf_err(-EINVAL); + + if (btf__base_btf(btf)) { + n = btf->nr_types; + } else { + if (id_map[0] != 0) + return libbpf_err(-EINVAL); + n = btf__type_cnt(btf); + start_offs = 1; + } + + if (id_map_cnt != n) + return libbpf_err(-EINVAL); + + /* record the sequence of types */ + order_map = calloc(id_map_cnt, sizeof(*id_map)); + if (!order_map) { + err = -ENOMEM; + goto done; + } + + new_types = calloc(btf->hdr->type_len, 1); + if (!new_types) { + err = -ENOMEM; + goto done; + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + for (i = start_offs; i < id_map_cnt; i++) { + id = id_map[i]; + if (id < btf->start_id || id >= btf__type_cnt(btf)) { + err = -EINVAL; + goto done; + } + id -= btf->start_id - start_offs; + /* cannot be mapped to the same ID */ + if (order_map[id]) { + err = -EINVAL; + goto done; + } + order_map[id] = i + btf->start_id - start_offs; + } + + p.btf = btf; + p.id_map = id_map; + p.start_offs = start_offs; + nt = new_types; + for (i = start_offs; i < id_map_cnt; i++) { + struct btf_field_iter it; + const struct btf_type *t; + __u32 *type_id; + int type_size; + + id = order_map[i]; + t = btf__type_by_id(btf, id); + type_size = btf_type_size(t); + memcpy(nt, t, type_size); + + /* fix up referenced IDs for BTF */ + err = btf_field_iter_init(&it, nt, BTF_FIELD_ITER_IDS); + if (err) + goto done; + while ((type_id = btf_field_iter_next(&it))) { + err = btf_permute_remap_type_id(type_id, &p); + if (err) + goto done; + } + + nt += type_size; + } + + /* fix up referenced IDs for btf_ext */ + btf_ext = OPTS_GET(opts, btf_ext, NULL); + if (btf_ext) { + err = btf_ext_visit_type_ids(btf_ext, btf_permute_remap_type_id, &p); + if (err) + goto done; + } + + for (nt = new_types, i = 0; i < id_map_cnt - start_offs; i++) { + btf->type_offs[i] = nt - new_types; + nt += btf_type_size(nt); + } + + free(order_map); + free(btf->types_data); + btf->types_data = new_types; + return 0; + +done: + free(order_map); + free(new_types); + return libbpf_err(err); +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index cc01494d6210..b30008c267c0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -281,6 +281,48 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); */ LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf); +struct btf_permute_opts { + size_t sz; + /* optional .BTF.ext info along the main BTF info */ + struct btf_ext *btf_ext; + size_t :0; +}; +#define btf_permute_opts__last_field btf_ext + +/** + * @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping + * @param btf BTF object to permute + * @param id_map Array mapping original type IDs to new IDs + * @param id_map_cnt Number of elements in @id_map + * @param opts Optional parameters, including BTF extension data for reference updates + * @return 0 on success, negative error code on failure + * + * **btf__permute()** reorders BTF types based on the provided @id_map array, + * updating all internal type references to maintain consistency. The function + * operates in-place, modifying the BTF object directly. + * + * For **base BTF**: + * - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1` + * - @id_map_cnt must be `btf__type_cnt(btf)` + * - Mapping is defined as `id_map[original_id] = new_id` + * - `id_map[0]` must be 0 (void type cannot be moved) + * + * For **split BTF**: + * - @id_map must include only split types (types added on top of the base BTF) + * - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))` + * - Mapping is defined as `id_map[original_id - start_id] = new_id` + * - `start_id` equals `btf__type_cnt(btf__base_btf(btf))` + * + * After permutation, all type references within the BTF data and optional + * BTF extension (if provided via @opts) are updated automatically. + * + * On error, returns a negative error code and sets errno: + * - `-EINVAL`: Invalid parameters or invalid ID mapping + * - `-ENOMEM`: Memory allocation failure + */ +LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts); + struct btf_dump; struct btf_dump_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 84fb90a016c9..d18fbcea7578 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -453,4 +453,5 @@ LIBBPF_1.7.0 { bpf_map__exclusive_program; bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; + btf__permute; } LIBBPF_1.6.0; -- cgit v1.2.3 From a3acd7d43462a7f7429301afad3c0059276f427e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:54 +0800 Subject: selftests/bpf: Add test cases for btf__permute functionality This patch introduces test cases for the btf__permute function to ensure it works correctly with both base BTF and split BTF scenarios. The test suite includes: - test_permute_base: Validates permutation on base BTF - test_permute_split: Tests permutation on split BTF Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-3-dolinux.peng@gmail.com --- .../testing/selftests/bpf/prog_tests/btf_permute.c | 244 +++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_permute.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_permute.c b/tools/testing/selftests/bpf/prog_tests/btf_permute.c new file mode 100644 index 000000000000..04ade5ad77ac --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_permute.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Xiaomi */ + +#include +#include +#include "btf_helpers.h" + +static void permute_base_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[2] FUNC 'f' type_id=6 linkage=static", + "[3] PTR '(anon)' type_id=4", + "[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[5] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[6] FUNC_PROTO '(anon)' ret_type_id=4 vlen=1\n" + "\t'p' type_id=3"); +} + +/* Ensure btf__permute works as expected in the base-BTF scenario */ +static void test_permute_base(void) +{ + struct btf *btf; + __u32 permute_ids[7]; + int err; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_main_btf")) + return; + + btf__add_int(btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf, 1); /* [2] ptr to int */ + btf__add_struct(btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(btf, 1); /* [5] int (*)(int *p); */ + btf__add_func_param(btf, "p", 2); + btf__add_func(btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_base")) + goto done; + permute_base_check(btf); + + /* ids[0] must be 0 for base BTF */ + permute_ids[0] = 4; /* [0] -> [0] */ + permute_ids[1] = 0; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* id_map_cnt is invalid */ + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 4; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 6; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Type ID must be valid */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 3; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 7; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + +done: + btf__free(btf); +} + +static void permute_split_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] FUNC 'f' type_id=5 linkage=static", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0"); +} + +/* Ensure btf__permute works as expected in the split-BTF scenario */ +static void test_permute_split(void) +{ + struct btf *split_btf = NULL, *base_btf = NULL; + __u32 permute_ids[4]; + int err, start_id; + + base_btf = btf__new_empty(); + if (!ASSERT_OK_PTR(base_btf, "empty_main_btf")) + return; + + btf__add_int(base_btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(base_btf, 1); /* [2] ptr to int */ + VALIDATE_RAW_BTF( + base_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + split_btf = btf__new_empty_split(base_btf); + if (!ASSERT_OK_PTR(split_btf, "empty_split_btf")) + goto cleanup; + btf__add_struct(split_btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(split_btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(split_btf, 1); /* [5] int (*)(int p); */ + btf__add_func_param(split_btf, "p", 2); + btf__add_func(split_btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + split_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + start_id = btf__type_cnt(base_btf); + permute_ids[3 - start_id] = 6; /* [3] -> [6] */ + permute_ids[4 - start_id] = 3; /* [4] -> [3] */ + permute_ids[5 - start_id] = 5; /* [5] -> [5] */ + permute_ids[6 - start_id] = 4; /* [6] -> [4] */ + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_split")) + goto cleanup; + permute_split_check(split_btf); + + /* + * For split BTF, id_map_cnt must equal to the number of types + * added on top of base BTF + */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 3; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Can not map to base ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 2; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + +cleanup: + btf__free(split_btf); + btf__free(base_btf); +} + +void test_btf_permute(void) +{ + if (test__start_subtest("permute_base")) + test_permute_base(); + if (test__start_subtest("permute_split")) + test_permute_split(); +} -- cgit v1.2.3 From 230e7d7de5a8d2bcda2a5cdcc8c176c09a63e331 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:55 +0800 Subject: tools/resolve_btfids: Support BTF sorting feature This introduces a new BTF sorting phase that specifically sorts BTF types by name in ascending order, so that the binary search can be used to look up types. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-4-dolinux.peng@gmail.com --- tools/bpf/resolve_btfids/main.c | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index df39982f51df..343d08050116 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -850,6 +850,67 @@ static int dump_raw_btf(struct btf *btf, const char *out_path) return 0; } +/* + * Sort types by name in ascending order resulting in all + * anonymous types being placed before named types. + */ +static int cmp_type_names(const void *a, const void *b, void *priv) +{ + struct btf *btf = (struct btf *)priv; + const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a); + const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b); + const char *na, *nb; + + na = btf__str_by_offset(btf, ta->name_off); + nb = btf__str_by_offset(btf, tb->name_off); + return strcmp(na, nb); +} + +static int sort_btf_by_name(struct btf *btf) +{ + __u32 *permute_ids = NULL, *id_map = NULL; + int nr_types, i, err = 0; + __u32 start_id = 0, start_offs = 1, id; + + if (btf__base_btf(btf)) { + start_id = btf__type_cnt(btf__base_btf(btf)); + start_offs = 0; + } + nr_types = btf__type_cnt(btf) - start_id; + + permute_ids = calloc(nr_types, sizeof(*permute_ids)); + if (!permute_ids) { + err = -ENOMEM; + goto out; + } + + id_map = calloc(nr_types, sizeof(*id_map)); + if (!id_map) { + err = -ENOMEM; + goto out; + } + + for (i = 0, id = start_id; i < nr_types; i++, id++) + permute_ids[i] = id; + + qsort_r(permute_ids + start_offs, nr_types - start_offs, + sizeof(*permute_ids), cmp_type_names, btf); + + for (i = 0; i < nr_types; i++) { + id = permute_ids[i] - start_id; + id_map[id] = i + start_id; + } + + err = btf__permute(btf, id_map, nr_types, NULL); + if (err) + pr_err("FAILED: btf permute: %s\n", strerror(-err)); + +out: + free(permute_ids); + free(id_map); + return err; +} + static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) { int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); @@ -1025,6 +1086,9 @@ int main(int argc, const char **argv) if (load_btf(&obj)) goto out; + if (sort_btf_by_name(obj.btf)) + goto out; + if (elf_collect(&obj)) goto out; -- cgit v1.2.3 From d836e5e64992363b5fa9b121f1ab4a1a1b89162d Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:56 +0800 Subject: libbpf: Optimize type lookup with binary search for sorted BTF This patch introduces binary search optimization for BTF type lookups when the BTF instance contains sorted types. The optimization significantly improves performance when searching for types in large BTF instances with sorted types. For unsorted BTF, the implementation falls back to the original linear search. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-5-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 90 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index bf75f770d29a..5a6ac40439e4 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -92,6 +92,8 @@ struct btf { * - for split BTF counts number of types added on top of base BTF. */ __u32 nr_types; + /* the start IDs of named types in sorted BTF */ + int named_start_id; /* if not NULL, points to the base BTF on top of which the current * split BTF is based */ @@ -897,46 +899,81 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) return type_id; } -__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name, + __s32 start_id) { - __u32 i, nr_types = btf__type_cnt(btf); - - if (!strcmp(type_name, "void")) - return 0; - - for (i = 1; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name = btf__name_by_offset(btf, t->name_off); - - if (name && !strcmp(type_name, name)) - return i; + const struct btf_type *t; + const char *tname; + __s32 l, r, m; + + l = start_id; + r = btf__type_cnt(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } } - return libbpf_err(-ENOENT); + return btf__type_cnt(btf); } static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, - const char *type_name, __u32 kind) + const char *type_name, __s32 kind) { - __u32 i, nr_types = btf__type_cnt(btf); + __u32 nr_types = btf__type_cnt(btf); + const struct btf_type *t; + const char *tname; + __s32 id; - if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) - return 0; + if (start_id < btf->start_id) { + id = btf_find_by_name_kind(btf->base_btf, start_id, + type_name, kind); + if (id >= 0) + return id; + start_id = btf->start_id; + } - for (i = start_id; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name; + if (kind == BTF_KIND_UNKN || strcmp(type_name, "void") == 0) + return 0; - if (btf_kind(t) != kind) - continue; - name = btf__name_by_offset(btf, t->name_off); - if (name && !strcmp(type_name, name)) - return i; + if (btf->named_start_id > 0 && type_name[0]) { + start_id = max(start_id, btf->named_start_id); + id = btf_find_type_by_name_bsearch(btf, type_name, start_id); + for (; id < nr_types; id++) { + t = btf__type_by_id(btf, id); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) != 0) + return libbpf_err(-ENOENT); + if (kind < 0 || btf_kind(t) == kind) + return id; + } + } else { + for (id = start_id; id < nr_types; id++) { + t = btf_type_by_id(btf, id); + if (kind > 0 && btf_kind(t) != kind) + continue; + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) == 0) + return id; + } } return libbpf_err(-ENOENT); } +/* the kind value of -1 indicates that kind matching should be skipped */ +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +{ + return btf_find_by_name_kind(btf, 1, type_name, -1); +} + __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind) { @@ -1006,6 +1043,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->fd = -1; btf->ptr_sz = sizeof(void *); btf->swapped_endian = false; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1057,6 +1095,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b btf->start_id = 1; btf->start_str_off = 0; btf->fd = -1; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1715,6 +1754,7 @@ static void btf_invalidate_raw_data(struct btf *btf) free(btf->raw_data_swapped); btf->raw_data_swapped = NULL; } + btf->named_start_id = 0; } /* Ensure BTF is ready to be modified (by splitting into a three memory -- cgit v1.2.3 From 33ecca574f1c27cbf560aee9c1b3045dcb9f8de5 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:57 +0800 Subject: libbpf: Verify BTF sorting This patch checks whether the BTF is sorted by name in ascending order. If sorted, binary search will be used when looking up types. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-6-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 5a6ac40439e4..808e53961ed6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -899,6 +899,30 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) return type_id; } +static void btf_check_sorted(struct btf *btf) +{ + __u32 i, n, named_start_id = 0; + + n = btf__type_cnt(btf); + for (i = btf->start_id + 1; i < n; i++) { + struct btf_type *ta = btf_type_by_id(btf, i - 1); + struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf__str_by_offset(btf, ta->name_off); + const char *nb = btf__str_by_offset(btf, tb->name_off); + + if (strcmp(na, nb) > 0) + return; + + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; + } + + if (named_start_id) + btf->named_start_id = named_start_id; +} + static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name, __s32 start_id) { @@ -1130,6 +1154,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b err = err ?: btf_sanity_check(btf); if (err) goto done; + btf_check_sorted(btf); done: if (err) { -- cgit v1.2.3 From 9282a42a1fe16c61a253293af439d6fecd8b5b6c Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:03 +0800 Subject: btf: Refactor the code by calling str_is_empty Calling the str_is_empty function to clarify the code and no functional changes are introduced. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-12-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 34 +++++++++++++++++----------------- tools/lib/bpf/libbpf.c | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 808e53961ed6..83fe79ffcb8f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2134,7 +2134,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) @@ -2182,7 +2182,7 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be one of the explicitly allowed values */ @@ -2237,7 +2237,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2314,7 +2314,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2415,7 +2415,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, if (!m) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2453,7 +2453,7 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2511,7 +2511,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (value < INT_MIN || value > UINT_MAX) return libbpf_err(-E2BIG); @@ -2588,7 +2588,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* decompose and invalidate raw data */ @@ -2628,7 +2628,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) */ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); switch (fwd_kind) { @@ -2664,7 +2664,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) */ int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id, 0); @@ -2716,7 +2716,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id) */ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 0); @@ -2733,7 +2733,7 @@ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) */ int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 1); @@ -2752,7 +2752,7 @@ int btf__add_func(struct btf *btf, const char *name, { int id; - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && linkage != BTF_FUNC_EXTERN) @@ -2838,7 +2838,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) if (!p) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2873,7 +2873,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && linkage != BTF_VAR_GLOBAL_EXTERN) @@ -2922,7 +2922,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (btf_ensure_modifiable(btf)) @@ -2999,7 +2999,7 @@ static int btf_add_decl_tag(struct btf *btf, const char *value, int ref_type_id, struct btf_type *t; int sz, value_off; - if (!value || !value[0] || component_idx < -1) + if (str_is_empty(value) || component_idx < -1) return libbpf_err(-EINVAL); if (validate_type_id(ref_type_id)) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6ea81701e274..bbcfd72b07d5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2904,7 +2904,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, var_extra = btf_var(var); map_name = btf__name_by_offset(obj->btf, var->name_off); - if (map_name == NULL || map_name[0] == '\0') { + if (str_is_empty(map_name)) { pr_warn("map #%d: empty name.\n", var_idx); return -EINVAL; } @@ -4281,7 +4281,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (!sym_is_extern(sym)) continue; ext_name = elf_sym_str(obj, sym->st_name); - if (!ext_name || !ext_name[0]) + if (str_is_empty(ext_name)) continue; ext = obj->externs; -- cgit v1.2.3 From c3a9a27c79e4e5d8bdb20a26d16230111207e98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:25 -0800 Subject: KVM: selftests: Add a test to verify APICv updates (while L2 is active) Add a test to verify KVM correctly handles a variety of edge cases related to APICv updates, and in particular updates that are triggered while L2 is actively running. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/include/x86/apic.h | 4 + .../selftests/kvm/x86/vmx_apicv_updates_test.c | 155 +++++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..6f00bd8271c2 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -115,6 +115,7 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test +TEST_GEN_PROGS_x86 += x86/vmx_apicv_updates_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..d42a0998d868 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -32,6 +32,7 @@ #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 #define APIC_IRR 0x200 #define APIC_ICR 0x300 #define APIC_LVTCMCI 0x2f0 @@ -68,6 +69,9 @@ #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) +#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) + void apic_disable(void); void xapic_enable(void); void x2apic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c new file mode 100644 index 000000000000..337c53fddeff --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define GOOD_IPI_VECTOR 0xe0 +#define BAD_IPI_VECTOR 0xf0 + +static volatile int good_ipis_received; + +static void good_ipi_handler(struct ex_regs *regs) +{ + good_ipis_received++; +} + +static void bad_ipi_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Received \"bad\" IPI; ICR MMIO write should have been ignored"); +} + +static void l2_guest_code(void) +{ + x2apic_enable(); + vmcall(); + + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + /* Modify APIC ID to coerce KVM into inhibiting APICv. */ + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + + /* + * Generate+receive an IRQ without doing EOI to get an IRQ set in vISR + * but not SVI. APICv should be inhibited due to running with a + * modified APIC ID. + */ + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ID), 1 << 24); + + /* Enable IRQs and verify the IRQ was received. */ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 1); + + /* + * Run L2 to switch to x2APIC mode, which in turn will uninhibit APICv, + * as KVM should force the APIC ID back to its default. + */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + GUEST_ASSERT(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD); + + /* + * Scribble the APIC access page to verify KVM disabled xAPIC + * virtualization in vmcs01, and to verify that KVM flushes L1's TLB + * when L2 switches back to accelerated xAPIC mode. + */ + xapic_write_reg(APIC_ICR2, 0xdeadbeefu); + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | BAD_IPI_VECTOR); + + /* + * Verify the IRQ is still in-service and emit an EOI to verify KVM + * propagates the highest vISR vector to SVI when APICv is activated + * (and does so even if APICv was uninhibited while L2 was active). + */ + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + x2apic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + + /* + * Run L2 one more time to switch back to xAPIC mode to verify that KVM + * handles the x2APIC => xAPIC transition and inhibits APICv while L2 + * is active. + */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT(!(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD)); + + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + /* Re-enable IRQs, as VM-Exit clears RFLAGS.IF. */ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 2); + + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + xapic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct vmx_pages *vmx; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + vm_install_exception_handler(vm, BAD_IPI_VECTOR, bad_ipi_handler); + vm_install_exception_handler(vm, GOOD_IPI_VECTOR, good_ipi_handler); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + } + + /* + * Verify at least two IRQs were injected. Unfortunately, KVM counts + * re-injected IRQs (e.g. if delivering the IRQ hits an EPT violation), + * so being more precise isn't possible given the current stats. + */ + TEST_ASSERT(vcpu_get_stat(vcpu, irq_injections) >= 2, + "Wanted at least 2 IRQ injections, got %lu\n", + vcpu_get_stat(vcpu, irq_injections)); + + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:44 -0800 Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of KVM Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit value. Per the APM and Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.") (which is arguably more trustworthy than KVM), offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE Track exit_code as a single u64 to prevent reintroducing bugs where KVM neglects to correctly set bits 63:32. Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +- arch/x86/include/uapi/asm/svm.h | 32 +++++++++---------- arch/x86/kvm/svm/hyperv.c | 1 - arch/x86/kvm/svm/nested.c | 13 ++------ arch/x86/kvm/svm/sev.c | 36 ++++++++-------------- arch/x86/kvm/svm/svm.c | 7 ++--- arch/x86/kvm/svm/svm.h | 4 +-- arch/x86/kvm/trace.h | 6 ++-- include/hyperv/hvgdk.h | 2 +- tools/testing/selftests/kvm/include/x86/svm.h | 3 +- .../kvm/x86/svm_nested_soft_inject_test.c | 4 +-- 11 files changed, 42 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 50ece197c98a..edde36097ddc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 650e3256ea7d..010a45c9f614 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -103,38 +103,38 @@ #define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ -#define SVM_VMGEXIT_MMIO_READ 0x80000001 -#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 -#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 -#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 -#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_MMIO_READ 0x80000001ull +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002ull +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003ull +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004ull +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005ull #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 -#define SVM_VMGEXIT_PSC 0x80000010 -#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 -#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 -#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_PSC 0x80000010ull +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011ull +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012ull +#define SVM_VMGEXIT_AP_CREATION 0x80000013ull #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 -#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 -#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018ull +#define SVM_VMGEXIT_SAVIC 0x8000001aull #define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 #define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL -#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd -#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffdull +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffeull #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ /* SW_EXITINFO1[3:0] */ \ (((((u64)reason_set) & 0xf)) | \ /* SW_EXITINFO1[11:4] */ \ ((((u64)reason_code) & 0xff) << 4)) -#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffffull /* Exit code reserved for hypervisor/software use */ -#define SVM_EXIT_SW 0xf0000000 +#define SVM_EXIT_SW 0xf0000000ull -#define SVM_EXIT_ERR -1 +#define SVM_EXIT_ERR -1ull #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 088f6429b24c..3ec580d687f5 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 666b5a36c15d..5aa0512e09c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, * correctly fill in the high bits of exit_info_1. */ vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_code_hi = 0; vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_2 = fault->address; } @@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->int_vector = from->int_vector; to->int_state = from->int_state; to->exit_code = from->exit_code; - to->exit_code_hi = from->exit_code_hi; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; @@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, enter_guest_mode(vcpu); /* - * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, - * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, + * exit_int_info_err, next_rip, insn_len, insn_bytes. */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && @@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1051,7 +1048,6 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; - vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; @@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) static int nested_svm_intercept(struct vcpu_svm *svm) { - u32 exit_code = svm->vmcb->control.exit_code; + u64 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; if (svm_is_vmrun_failure(exit_code)) @@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; - vmcb->control.exit_code_hi = 0; if (ex->has_error_code) vmcb->control.exit_info_1 = ex->error_code; @@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->int_vector = from->int_vector; dst->int_state = from->int_state; dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; dst->exit_info_1 = from->exit_info_1; dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 28150506b18c..f67525007089 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3270,11 +3270,6 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb = svm->sev_es.ghcb; - u64 exit_code; /* * The GHCB protocol so far allows for the following data @@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = kvm_ghcb_get_sw_exit_code(svm); - control->exit_code = lower_32_bits(exit_code); - control->exit_code_hi = upper_32_bits(exit_code); + control->exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); @@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - u64 exit_code; u64 reason; - /* - * Retrieve the exit code now even though it may not be marked valid - * as it could help with debugging. - */ - exit_code = kvm_get_cached_sw_exit_code(control); - /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; @@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (exit_code) { + switch (control->exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: + /* + * Print the exit code even though it may not be marked valid as it + * could help with debugging. + */ if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", - exit_code); + control->exit_code); } else { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", - exit_code); + control->exit_code); dump_ghcb(svm); } @@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - u64 ghcb_gpa, exit_code; + u64 ghcb_gpa; int ret; /* Validate the GHCB */ @@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_get_cached_sw_exit_code(control); - switch (exit_code) { + switch (control->exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) @@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = -EINVAL; break; default: - ret = svm_invoke_exit_handler(vcpu, exit_code); + ret = svm_invoke_exit_handler(vcpu, control->exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3caf7a21679f..a28cd61d87ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); @@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(vcpu, exit_code); + return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } static int pre_svm_run(struct kvm_vcpu *vcpu) @@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; - vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3360ac36e071..a22433680c73 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached { u32 int_ctl; u32 int_vector; u32 int_state; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; @@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e79bc9cb7162..e7fdbe9efc90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic, #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ - __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + __print_symbolic_u64(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? \ - __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + __print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ @@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, + TP_PROTO(__u64 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h index dd6d4939ea29..384c3f3ff4a5 100644 --- a/include/hyperv/hvgdk.h +++ b/include/hyperv/hvgdk.h @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments { #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 /* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_EXITCODE_ENL 0xf0000000ull #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) /* VM_PARTITION_ASSIST_PAGE */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 29cffd0a9181..10b30b38bb3f 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 7b6481d6c0d3..4bd1655f9e6d 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); @@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); -- cgit v1.2.3 From 69cb6ca52da095d300cf42666c903ae787a762dd Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 12 Jan 2026 19:56:31 -0800 Subject: tools/net/ynl: suppress jobserver warning in ynltool version detection When building ynltool with parallel make (-jN), a warning is emitted: make[1]: warning: jobserver unavailable: using -j1. Add '+' to parent make rule. The warning trips up local runs of NIPA's ingest_mdir.py, which correctly fails on make warnings. This occurs because SRC_VERSION uses $(shell make ...) to make kernelversion. The $(shell) function inherits make's MAKEFLAGS env var which specifies "--jobserver-auth=R,W" pointing to file descriptors that the invoked make sub-shell does not have access to. Observed with: $ make --version | head -1 GNU Make 4.3 Instead of suppressing MAKEFLAGS and foregoing all future MAKEFLAGS (some of which may be desirable, such as variable overrides) or introducing a new make target, we instead just ignore the warning by piping stderr to /dev/null. If 'make kernelversion' fails, the ' || echo "unknown"' phrase will catch the failure. Before: NIPA ingest_mdir.py: ynl Full series FAIL (1) Generated files up to date; build has 1 warnings/errors; no diff in generated; After: NIPA ingest_mdir.py: Series level tests: ynl OKAY Validated output: $ ./ynltool/ynltool --version ynltool 6.19.0-rc4 Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260112-ynl-make-fix-v1-1-c399e76925ad@meta.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynltool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ynltool/Makefile b/tools/net/ynl/ynltool/Makefile index f5b1de32daa5..48b0f32050f0 100644 --- a/tools/net/ynl/ynltool/Makefile +++ b/tools/net/ynl/ynltool/Makefile @@ -13,7 +13,7 @@ endif CFLAGS += -I../lib -I../generated -I../../../include/uapi/ SRC_VERSION := \ - $(shell make --no-print-directory -sC ../../../.. kernelversion || \ + $(shell make --no-print-directory -sC ../../../.. kernelversion 2>/dev/null || \ echo "unknown") CFLAGS += -DSRC_VERSION='"$(SRC_VERSION)"' -- cgit v1.2.3 From b324192e36ecea472077f9c9e32bcac8bbafeed6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:35 -0800 Subject: selftests: net: py: teach ksft_pr() multi-line safety Make printing multi-line logs easier by automatically prefixing each line in ksft_pr(). Make use of this when formatting exceptions. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 0a96f88bb60a..6cdfb8afccb5 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -32,8 +32,23 @@ class KsftTerminate(KeyboardInterrupt): def ksft_pr(*objs, **kwargs): + """ + Print logs to stdout. + + Behaves like print() but log lines will be prefixed + with # to prevent breaking the TAP output formatting. + + Extra arguments (on top of what print() supports): + line_pfx - add extra string before each line + """ + sep = kwargs.pop("sep", " ") + pfx = kwargs.pop("line_pfx", "") + pfx = "#" + (" " + pfx if pfx else "") kwargs["flush"] = True - print("#", *objs, **kwargs) + + text = sep.join(str(obj) for obj in objs) + prefixed = f"\n{pfx} ".join(text.split('\n')) + print(pfx, prefixed, **kwargs) def _fail(*args): @@ -170,9 +185,7 @@ def ksft_flush_defer(): entry.exec_only() except Exception: ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!") - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Defer Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Defer Exception|") KSFT_RESULT = False @@ -331,9 +344,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cnt_key = 'xfail' except BaseException as e: stop |= isinstance(e, KeyboardInterrupt) - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if stop: ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False @@ -343,9 +354,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): try: ksft_flush_defer() except BaseException as e: - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if isinstance(e, KeyboardInterrupt): ksft_pr() ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.") -- cgit v1.2.3 From ce0f92dc737c65d705d83ea9529d8fb9a2194241 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:36 -0800 Subject: selftests: net: py: teach cmd() how to print itself Teach cmd() how to print itself, to make debug prints easier. Example output (leading # due to ksft_pr()): # CMD: /root/ksft-net-drv/drivers/net/gro # EXIT: 1 # STDOUT: ipv6 with ext header does coalesce: # STDERR: Expected {200 }, Total 1 packets # Received {100 [!=200]100 [!=0]}, Total 2 packets. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 824f039d384c..37243103aee3 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -41,7 +41,9 @@ class cmd: self.ret = None self.ksft_term_fd = None + self.host = host self.comm = comm + if host: self.proc = host.cmd(comm) else: @@ -99,6 +101,27 @@ class cmd: raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % (self.proc.args, stdout, stderr), self) + def __repr__(self): + def str_fmt(name, s): + name += ': ' + return (name + s.strip().replace('\n', '\n' + ' ' * len(name))) + + ret = "CMD" + if self.host: + ret += "[remote]" + if self.ret is None: + ret += f" (unterminated): {self.comm}\n" + elif self.ret == 0: + ret += f" (success): {self.comm}\n" + else: + ret += f": {self.comm}\n" + ret += f" EXIT: {self.ret}\n" + if self.stdout: + ret += str_fmt(" STDOUT", self.stdout) + "\n" + if self.stderr: + ret += str_fmt(" STDERR", self.stderr) + "\n" + return ret.strip() + class bkg(cmd): """ -- cgit v1.2.3 From d131da6d7282829c775cf70f5f1db1576e5c1273 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:37 -0800 Subject: selftests: drv-net: gro: use cmd print Now that cmd() can be printed directly remove the old formatting. Before: # fragmented ip6 doesn't coalesce: # Expected {200 100 100 }, Total 3 packets # Received {200 100 }, Total 2 packets. # /root/ksft-net-drv/drivers/net/gro: incorrect number of packets Now: # CMD: drivers/net/gro --ipv6 --dmac 9e:[...] # EXIT: 1 # STDOUT: fragmented ip6 doesn't coalesce: # STDERR: Expected {200 100 100 }, Total 3 packets # Received {200 100 }, Total 2 packets. # /root/ksft-net-drv/drivers/net/gro: incorrect number of packets Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index ba83713bf7b5..4e0fb19d1527 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -142,8 +142,7 @@ def test(cfg, protocol, test_name): if rx_proc.ret == 0: return - ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) - ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + ksft_pr(rx_proc) if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") -- cgit v1.2.3 From 8171f6a76b2250d93a550566af6b915dc92edc75 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:38 -0800 Subject: selftests: drv-net: gro: improve feature config We'll need to do a lot more feature handling to test HW-GRO and LRO. Clean up the feature handling for SW GRO a bit to let the next commit focus on the new test cases, only. Make sure HW GRO-like features are not enabled for the SW tests. Be more careful about changing features as "nothing changed" situations may result in non-zero error code from ethtool. Don't disable TSO on the local interface (receiver) when running over netdevsim, we just want GSO to break up the segments on the sender. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 39 +++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 4e0fb19d1527..3c30749ead39 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -20,7 +20,7 @@ Test cases: import os from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import NetDrvEpEnv, KsftXfailEx -from lib.py import cmd, defer, bkg, ip +from lib.py import bkg, cmd, defer, ethtool, ip from lib.py import ksft_variants @@ -70,6 +70,27 @@ def _set_mtu_restore(dev, mtu, host): defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host) +def _set_ethtool_feat(dev, current, feats, host=None): + s2n = {True: "on", False: "off"} + + new = ["-K", dev] + old = ["-K", dev] + no_change = True + for name, state in feats.items(): + new += [name, s2n[state]] + old += [name, s2n[current[name]["active"]]] + + if current[name]["active"] != state: + no_change = False + if current[name]["fixed"]: + raise KsftXfailEx(f"Device does not support {name}") + if no_change: + return + + ethtool(" ".join(new), host=host) + defer(ethtool, " ".join(old), host=host) + + def _setup(cfg, test_name): """ Setup hardware loopback mode for GRO testing. """ @@ -77,6 +98,11 @@ def _setup(cfg, test_name): cfg.bin_local = cfg.test_dir / "gro" cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + if not hasattr(cfg, "feat"): + cfg.feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}", + host=cfg.remote, json=True)[0] + # "large" test needs at least 4k MTU if test_name == "large": _set_mtu_restore(cfg.dev, 4096, None) @@ -88,15 +114,22 @@ def _setup(cfg, test_name): _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": True, + "rx-gro-hw": False, + "large-receive-offload": False}) + try: # Disable TSO for local tests cfg.require_nsim() # will raise KsftXfailEx if not running on nsim - cmd(f"ethtool -K {cfg.ifname} gro on tso off") - cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote) + _set_ethtool_feat(cfg.remote_ifname, cfg.remote_feat, + {"tcp-segmentation-offload": False}, + host=cfg.remote) except KsftXfailEx: pass + def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" -- cgit v1.2.3 From d3b35898de024796c43415f9535fd0bc69cb8f1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:39 -0800 Subject: selftests: drv-net: gro: run the test against HW GRO and LRO Run the test against HW GRO and LRO. NICs I have pass the base cases. Interestingly all are happy to build GROs larger than 64k. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 72 +++++++++++++++++------ tools/testing/selftests/drivers/net/lib/py/env.py | 7 ++- 2 files changed, 60 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 3c30749ead39..1ab85590c439 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -87,11 +87,15 @@ def _set_ethtool_feat(dev, current, feats, host=None): if no_change: return - ethtool(" ".join(new), host=host) + eth_cmd = ethtool(" ".join(new), host=host) defer(ethtool, " ".join(old), host=host) + # If ethtool printed something kernel must have modified some features + if eth_cmd.stdout: + ksft_pr(eth_cmd) -def _setup(cfg, test_name): + +def _setup(cfg, mode, test_name): """ Setup hardware loopback mode for GRO testing. """ if not hasattr(cfg, "bin_remote"): @@ -108,16 +112,49 @@ def _setup(cfg, test_name): _set_mtu_restore(cfg.dev, 4096, None) _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) - flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" - irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" - - _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) - _write_defer_restore(cfg, irq_path, "10", defer_undo=True) - - _set_ethtool_feat(cfg.ifname, cfg.feat, - {"generic-receive-offload": True, - "rx-gro-hw": False, - "large-receive-offload": False}) + if mode == "sw": + flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" + irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" + + _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) + _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": True, + "rx-gro-hw": False, + "large-receive-offload": False}) + elif mode == "hw": + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": True, + "large-receive-offload": False}) + + # Some NICs treat HW GRO as a GRO sub-feature so disabling GRO + # will also clear HW GRO. Use a hack of installing XDP generic + # to skip SW GRO, even when enabled. + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not feat["rx-gro-hw"]["active"]: + ksft_pr("Driver clears HW GRO and SW GRO is cleared, using generic XDP workaround") + prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" + ip(f"link set dev {cfg.ifname} xdpgeneric obj {prog} sec xdp") + defer(ip, f"link set dev {cfg.ifname} xdpgeneric off") + + # Attaching XDP may change features, fetch the latest state + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + + _set_ethtool_feat(cfg.ifname, feat, + {"generic-receive-offload": True, + "rx-gro-hw": True, + "large-receive-offload": False}) + elif mode == "lro": + # netdevsim advertises LRO for feature inheritance testing with + # bonding/team tests but it doesn't actually perform the offload + cfg.require_nsim(nsim_test=False) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": False, + "large-receive-offload": True}) try: # Disable TSO for local tests @@ -133,19 +170,20 @@ def _setup(cfg, test_name): def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" - for protocol in ["ipv4", "ipv6", "ipip"]: - for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: - yield protocol, test_name + for mode in ["sw", "hw", "lro"]: + for protocol in ["ipv4", "ipv6", "ipip"]: + for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + yield mode, protocol, test_name @ksft_variants(_gro_variants()) -def test(cfg, protocol, test_name): +def test(cfg, mode, protocol, test_name): """Run a single GRO test with retries.""" ipver = "6" if protocol[-1] == "6" else "4" cfg.require_ipver(ipver) - _setup(cfg, test_name) + _setup(cfg, mode, test_name) base_cmd_args = [ f"--{protocol}", diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 63495376e654..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -248,9 +248,12 @@ class NetDrvEpEnv(NetDrvEnvBase): if not self.addr_v[ipver] or not self.remote_addr_v[ipver]: raise KsftSkipEx(f"Test requires IPv{ipver} connectivity") - def require_nsim(self): - if self._ns is None: + def require_nsim(self, nsim_test=True): + """Require or exclude netdevsim for this test""" + if nsim_test and self._ns is None: raise KsftXfailEx("Test only works on netdevsim") + if nsim_test is False and self._ns is not None: + raise KsftXfailEx("Test does not work on netdevsim") def _require_cmd(self, comm, key, host=None): cached = self._required_cmd.get(comm, {}) -- cgit v1.2.3 From fe074aaa5329246706c652ccba6602eecbed80a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:40 -0800 Subject: selftests: drv-net: gro: break out all individual test cases GRO test groups the cases into categories, e.g. "tcp" case checks coalescing in presence of: - packets with bad csum, - sequence number mismatch, - timestamp option value mismatch, - different TCP options. Since we now have TAP support grouping the cases like that lowers our reporting granularity. This matters even more for NICs performing HW GRO and LRO since it appears that most implementation have _some_ bugs. Flagging the whole group of tests as failed prevents us from catching regressions in the things that work today. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.c | 441 +++++++++++++++++------------ tools/testing/selftests/drivers/net/gro.py | 65 ++++- 2 files changed, 312 insertions(+), 194 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index 751a8103f408..e76c618704cf 100644 --- a/tools/testing/selftests/drivers/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -3,26 +3,45 @@ * This testsuite provides conformance testing for GRO coalescing. * * Test cases: - * 1.data + * + * data_*: * Data packets of the same size and same header setup with correct * sequence numbers coalesce. The one exception being the last data * packet coalesced: it can be smaller than the rest and coalesced * as long as it is in the same flow. - * 2.ack + * - data_same: same size packets coalesce + * - data_lrg_sml: large then small coalesces + * - data_sml_lrg: small then large doesn't coalesce + * + * ack: * Pure ACK does not coalesce. - * 3.flags - * Specific test cases: no packets with PSH, SYN, URG, RST set will - * be coalesced. - * 4.tcp + * + * flags_*: + * No packets with PSH, SYN, URG, RST set will be coalesced. + * - flags_psh, flags_syn, flags_rst, flags_urg + * + * tcp_*: * Packets with incorrect checksum, non-consecutive seqno and * different TCP header options shouldn't coalesce. Nit: given that * some extension headers have paddings, such as timestamp, headers - * that are padding differently would not be coalesced. - * 5.ip: - * Packets with different (ECN, TTL, TOS) header, ip options or - * ip fragments (ipv6) shouldn't coalesce. - * 6.large: + * that are padded differently would not be coalesced. + * - tcp_csum: incorrect checksum + * - tcp_seq: non-consecutive sequence numbers + * - tcp_ts: different timestamps + * - tcp_opt: different TCP options + * + * ip_*: + * Packets with different (ECN, TTL, TOS) header, IP options or + * IP fragments shouldn't coalesce. + * - ip_ecn, ip_tos: shared between IPv4/IPv6 + * - ip_ttl, ip_opt, ip_frag4: IPv4 only + * - ip_id_df*: IPv4 IP ID field coalescing tests + * - ip_frag6, ip_v6ext_*: IPv6 only + * + * large_*: * Packets larger than GRO_MAX_SIZE packets shouldn't coalesce. + * - large_max: exceeding max size + * - large_rem: remainder handling * * MSS is defined as 4096 - header because if it is too small * (i.e. 1500 MTU - header), it will result in many packets, @@ -79,6 +98,15 @@ #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +enum flush_id_case { + FLUSH_ID_DF1_INC, + FLUSH_ID_DF1_FIXED, + FLUSH_ID_DF0_INC, + FLUSH_ID_DF0_FIXED, + FLUSH_ID_DF1_INC_FIXED, + FLUSH_ID_DF1_FIXED_INC, +}; + static const char *addr6_src = "fdaa::2"; static const char *addr6_dst = "fdaa::1"; static const char *addr4_src = "192.168.1.200"; @@ -95,7 +123,6 @@ static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; static bool ipip; -static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) { @@ -127,19 +154,19 @@ static void setup_sock_filter(int fd) /* Overridden later if exthdrs are used: */ opt_ipproto_off = ipproto_off; - if (strcmp(testname, "ip") == 0) { - if (proto == PF_INET) - optlen = sizeof(struct ip_timestamp); - else { - BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); - - /* same size for HBH and Fragment extension header types */ - optlen = MIN_EXTHDR_SIZE; - opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) - + offsetof(struct ip6_ext, ip6e_nxt); - } + if (strcmp(testname, "ip_opt") == 0) { + optlen = sizeof(struct ip_timestamp); + } else if (strcmp(testname, "ip_frag6") == 0 || + strcmp(testname, "ip_v6ext_same") == 0 || + strcmp(testname, "ip_v6ext_diff") == 0) { + BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); + + /* same size for HBH and Fragment extension header types */ + optlen = MIN_EXTHDR_SIZE; + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) + + offsetof(struct ip6_ext, ip6e_nxt); } /* this filter validates the following: @@ -648,7 +675,8 @@ static void fix_ip4_checksum(struct iphdr *iph) iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); } -static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, + enum flush_id_case tcase) { static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -667,7 +695,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); switch (tcase) { - case 0: /* DF=1, Incrementing - should coalesce */ + case FLUSH_ID_DF1_INC: /* DF=1, Incrementing - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -675,7 +703,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 1: /* DF=1, Fixed - should coalesce */ + case FLUSH_ID_DF1_FIXED: /* DF=1, Fixed - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -683,7 +711,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 2: /* DF=0, Incrementing - should coalesce */ + case FLUSH_ID_DF0_INC: /* DF=0, Incrementing - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -691,7 +719,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should coalesce */ + case FLUSH_ID_DF0_FIXED: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -699,9 +727,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 4: /* DF=1, two packets incrementing, and one fixed - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_INC_FIXED: /* DF=1, two packets incrementing, and + * one fixed - should coalesce only the + * first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -713,9 +742,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) send_three = true; break; - case 5: /* DF=1, two packets fixed, and one incrementing - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_FIXED_INC: /* DF=1, two packets fixed, and one + * incrementing - should coalesce only + * the first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -739,16 +769,6 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) } } -static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) -{ - for (int i = 0; i < num_flush_id_cases; i++) { - sleep(1); - send_flush_id_case(fd, daddr, i); - sleep(1); - write_packet(fd, fin_pkt, total_hdr_len, daddr); - } -} - static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -1030,108 +1050,128 @@ static void gro_sender(void) daddr.sll_halen = ETH_ALEN; create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { send_ack(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "flags") == 0) { + + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { send_flags(txfd, &daddr, 1, 0, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_syn") == 0) { send_flags(txfd, &daddr, 0, 1, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_rst") == 0) { send_flags(txfd, &daddr, 0, 0, 1, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_urg") == 0) { send_flags(txfd, &daddr, 0, 0, 0, 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "tcp") == 0) { + + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { send_changed_checksum(txfd, &daddr); - /* Adding sleep before sending FIN so that it is not - * received prior to other packets. - */ usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_seq") == 0) { send_changed_seq(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_ts") == 0) { send_changed_ts(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_opt") == 0) { send_diff_opt(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { send_changed_ECN(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "ip_tos") == 0) { send_changed_tos(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - if (proto == PF_INET) { - /* Modified packets may be received out of order. - * Sleep function added to enforce test boundaries - * so that fin pkts are not received prior to other pkts. - */ - sleep(1); - send_changed_ttl(txfd, &daddr); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_ip_options(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_fragment4(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - test_flush_id(txfd, &daddr, fin_pkt); - } else if (proto == PF_INET6) { - sleep(1); - send_fragment6(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with same payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with different payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } - } else if (strcmp(testname, "large") == 0) { - /* 20 is the difference between min iphdr size - * and min ipv6hdr size. Like MAX_HDR_SIZE, - * MAX_PAYLOAD is defined with the larger header of the two. - */ + + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + send_changed_ttl(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_opt") == 0) { + send_ip_options(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_frag4") == 0) { + send_fragment4(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + send_fragment6(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder + 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else { - error(1, 0, "Unknown testcase"); + error(1, 0, "Unknown testcase: %s", testname); } if (close(txfd)) @@ -1155,126 +1195,153 @@ static void gro_receiver(void) memset(correct_payload, 0, sizeof(correct_payload)); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { printf("pure data packet of same size: "); correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { printf("large data packets followed by a smaller one: "); correct_payload[0] = PAYLOAD_LEN * 1.5; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { printf("small data packets followed by a larger one: "); correct_payload[0] = PAYLOAD_LEN / 2; correct_payload[1] = PAYLOAD_LEN; check_recv_pkts(rxfd, correct_payload, 2); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { printf("duplicate ack and pure ack: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "flags") == 0) { + + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { correct_payload[0] = PAYLOAD_LEN * 3; correct_payload[1] = PAYLOAD_LEN * 2; - printf("psh flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "flags_syn") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; correct_payload[1] = 0; correct_payload[2] = PAYLOAD_LEN * 2; printf("syn flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_rst") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("rst flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_urg") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("urg flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "tcp") == 0) { + + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { correct_payload[0] = PAYLOAD_LEN; correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - correct_payload[3] = PAYLOAD_LEN; - printf("changed checksum does not coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "tcp_seq") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("Wrong Seq number doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - - printf("Different timestamp doesn't coalesce: "); + } else if (strcmp(testname, "tcp_ts") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + correct_payload[3] = PAYLOAD_LEN; + printf("Different timestamp doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 4); - - printf("Different options doesn't coalesce: "); + } else if (strcmp(testname, "tcp_opt") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + printf("Different options doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { correct_payload[0] = PAYLOAD_LEN; correct_payload[1] = PAYLOAD_LEN; - printf("different ECN doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "ip_tos") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("different tos doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - if (proto == PF_INET) { - printf("different ttl doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - printf("ip options doesn't coalesce: "); - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("fragmented ip4 doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - /* is_atomic checks */ - printf("DF=1, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - - printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } else if (proto == PF_INET6) { - /* GRO doesn't check for ipv6 hop limit when flushing. - * Hence no corresponding test to the ipv4 case. - */ - printf("fragmented ip6 doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("ipv6 with ext header does coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("ipv6 with ext header with different payloads doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } - } else if (strcmp(testname, "large") == 0) { + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("different ttl doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_opt") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + printf("ip options doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_frag4") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("fragmented ip4 doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + /* GRO doesn't check for ipv6 hop limit when flushing. + * Hence no corresponding test to the ipv4 case. + */ + printf("fragmented ip6 doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + printf("ipv6 with ext header does coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + printf("ipv6 with ext header with different payloads doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; @@ -1282,14 +1349,18 @@ static void gro_receiver(void) correct_payload[1] = remainder; printf("Shouldn't coalesce if exceed IP max pkt size: "); check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; /* last segment sent individually, doesn't start new segment */ - correct_payload[0] = correct_payload[0] - remainder; + correct_payload[0] = (MAX_PAYLOAD + offset) - remainder; correct_payload[1] = remainder + 1; correct_payload[2] = remainder + 1; + printf("last segment sent individually: "); check_recv_pkts(rxfd, correct_payload, 3); } else { - error(1, 0, "Test case error, should never trigger"); + error(1, 0, "Test case error: unknown testname %s", testname); } if (close(rxfd)) diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 1ab85590c439..1bb8af571456 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -9,12 +9,29 @@ binary in different configurations and checking for correct packet coalescing behavior. Test cases: - - data: Data packets with same size/headers and correct seq numbers coalesce + - data_same: Same size data packets coalesce + - data_lrg_sml: Large packet followed by smaller one coalesces + - data_sml_lrg: Small packet followed by larger one doesn't coalesce - ack: Pure ACK packets do not coalesce - - flags: Packets with PSH, SYN, URG, RST flags do not coalesce - - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce - - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce - - large: Packets larger than GRO_MAX_SIZE don't coalesce + - flags_psh: Packets with PSH flag don't coalesce + - flags_syn: Packets with SYN flag don't coalesce + - flags_rst: Packets with RST flag don't coalesce + - flags_urg: Packets with URG flag don't coalesce + - tcp_csum: Packets with incorrect checksum don't coalesce + - tcp_seq: Packets with non-consecutive seqno don't coalesce + - tcp_ts: Packets with different timestamp options don't coalesce + - tcp_opt: Packets with different TCP options don't coalesce + - ip_ecn: Packets with different ECN don't coalesce + - ip_tos: Packets with different TOS don't coalesce + - ip_ttl: (IPv4) Packets with different TTL don't coalesce + - ip_opt: (IPv4) Packets with IP options don't coalesce + - ip_frag4: (IPv4) IPv4 fragments don't coalesce + - ip_id_df*: (IPv4) IP ID field coalescing tests + - ip_frag6: (IPv6) IPv6 fragments don't coalesce + - ip_v6ext_same: (IPv6) IPv6 ext header with same payload coalesces + - ip_v6ext_diff: (IPv6) IPv6 ext header with different payload doesn't coalesce + - large_max: Packets exceeding GRO_MAX_SIZE don't coalesce + - large_rem: Large packet remainder handling """ import os @@ -107,8 +124,8 @@ def _setup(cfg, mode, test_name): cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}", host=cfg.remote, json=True)[0] - # "large" test needs at least 4k MTU - if test_name == "large": + # "large_*" tests need at least 4k MTU + if test_name.startswith("large_"): _set_mtu_restore(cfg.dev, 4096, None) _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) @@ -170,11 +187,41 @@ def _setup(cfg, mode, test_name): def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" + # Tests that work for all protocols + common_tests = [ + "data_same", "data_lrg_sml", "data_sml_lrg", + "ack", + "flags_psh", "flags_syn", "flags_rst", "flags_urg", + "tcp_csum", "tcp_seq", "tcp_ts", "tcp_opt", + "ip_ecn", "ip_tos", + "large_max", "large_rem", + ] + + # Tests specific to IPv4 + ipv4_tests = [ + "ip_ttl", "ip_opt", "ip_frag4", + "ip_id_df1_inc", "ip_id_df1_fixed", + "ip_id_df0_inc", "ip_id_df0_fixed", + "ip_id_df1_inc_fixed", "ip_id_df1_fixed_inc", + ] + + # Tests specific to IPv6 + ipv6_tests = [ + "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff", + ] + for mode in ["sw", "hw", "lro"]: for protocol in ["ipv4", "ipv6", "ipip"]: - for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + for test_name in common_tests: yield mode, protocol, test_name + if protocol in ["ipv4", "ipip"]: + for test_name in ipv4_tests: + yield mode, protocol, test_name + elif protocol == "ipv6": + for test_name in ipv6_tests: + yield mode, protocol, test_name + @ksft_variants(_gro_variants()) def test(cfg, mode, protocol, test_name): @@ -215,7 +262,7 @@ def test(cfg, mode, protocol, test_name): ksft_pr(rx_proc) - if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): + if test_name.startswith("large_") and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") return -- cgit v1.2.3 From a32bb32d019332394aee9e2befea4fec05a672e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Nov 2025 12:15:20 +0000 Subject: selftests: iou-zcrx: test large chunk sizes Add a test using large chunks for zcrx memory area. Signed-off-by: Pavel Begunkov --- tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 72 ++++++++++++++++++---- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 39 ++++++++++++ 2 files changed, 99 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 62456df947bc..240d13dbc54e 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,23 @@ #include +#define SKIP_CODE 42 + +struct t_io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + static long page_size; #define AREA_SIZE (8192 * page_size) #define SEND_SIZE (512 * 4096) @@ -65,6 +83,8 @@ static bool cfg_oneshot; static int cfg_oneshot_recvs; static int cfg_send_size = SEND_SIZE; static struct sockaddr_in6 cfg_addr; +static unsigned int cfg_rx_buf_len; +static bool cfg_dry_run; static char *payload; static void *area_ptr; @@ -128,14 +148,28 @@ static void setup_zcrx(struct io_uring *ring) if (!ifindex) error(1, 0, "bad interface name: %s", cfg_ifname); - area_ptr = mmap(NULL, - AREA_SIZE, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - 0, - 0); - if (area_ptr == MAP_FAILED) - error(1, 0, "mmap(): zero copy area"); + if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + MAP_HUGETLB | MAP_HUGE_2MB, + -1, + 0); + if (area_ptr == MAP_FAILED) { + printf("Can't allocate huge pages\n"); + exit(SKIP_CODE); + } + } else { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + } ring_size = get_refill_ring_size(rq_entries); ring_ptr = mmap(NULL, @@ -157,17 +191,23 @@ static void setup_zcrx(struct io_uring *ring) .flags = 0, }; - struct io_uring_zcrx_ifq_reg reg = { + struct t_io_uring_zcrx_ifq_reg reg = { .if_idx = ifindex, .if_rxq = cfg_queue_id, .rq_entries = rq_entries, .area_ptr = (__u64)(unsigned long)&area_reg, .region_ptr = (__u64)(unsigned long)®ion_reg, + .rx_buf_len = cfg_rx_buf_len, }; - ret = io_uring_register_ifq(ring, ®); - if (ret) + ret = io_uring_register_ifq(ring, (void *)®); + if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP || + ret == -ERANGE)) { + printf("Large chunks are not supported %i\n", ret); + exit(SKIP_CODE); + } else if (ret) { error(1, 0, "io_uring_register_ifq(): %d", ret); + } rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); @@ -323,6 +363,8 @@ static void run_server(void) io_uring_queue_init(512, &ring, flags); setup_zcrx(&ring); + if (cfg_dry_run) + return; add_accept(&ring, fd); @@ -383,7 +425,7 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) { switch (c) { case 's': if (cfg_client) @@ -418,6 +460,12 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_send_size = strtoul(optarg, NULL, 0); break; + case 'x': + cfg_rx_buf_len = page_size * strtoul(optarg, NULL, 0); + break; + case 'd': + cfg_dry_run = true; + break; } } diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..7f596a33eb2b 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -7,6 +7,7 @@ from lib.py import ksft_run, ksft_exit, KsftSkipEx from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +SKIP_CODE = 42 def _get_current_settings(cfg): output = ethtool(f"-g {cfg.ifname}", json=True)[0] @@ -132,6 +133,44 @@ def test_zcrx_rss(cfg) -> None: cmd(tx_cmd, host=cfg.remote) +def test_zcrx_large_chunks(cfg) -> None: + """Test zcrx with large buffer chunks.""" + + cfg.require_ipver('6') + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() + + ethtool(f"-G {cfg.ifname} tcp-data-split on") + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + + ethtool(f"-G {cfg.ifname} hds-thresh 0") + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + + ethtool(f"-G {cfg.ifname} rx 64") + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -x 2" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + + probe = cmd(rx_cmd + " -d", fail=False) + if probe.ret == SKIP_CODE: + raise KsftSkipEx(probe.stdout) + + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(port, proto="tcp") + cmd(tx_cmd, host=cfg.remote) + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") -- cgit v1.2.3 From e5566f6b1d13e9bc0a458babb880916e212c45fb Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:09 +0200 Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv4 tests According to the test description, these tests fail because of a wrong nexthop device: # ./fib-onlink-tests.sh -v [...] COMMAND: ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth1 onlink Error: Nexthop has invalid gateway. TEST: Gateway resolves to wrong nexthop device [ OK ] COMMAND: ip ro add table 1101 169.254.102.103/32 via 169.254.7.1 dev veth5 onlink Error: Nexthop has invalid gateway. TEST: Gateway resolves to wrong nexthop device - VRF [ OK ] [...] But this is incorrect. They fail because the gateway addresses are local addresses: # ip -4 address show [...] 28: veth3@if27: mtu 1500 qdisc noqueue state UP group default qlen 1000 link-netns peer_ns-Urqh3o inet 169.254.3.1/24 scope global veth3 [...] 32: veth7@if31: mtu 1500 qdisc noqueue master lisa state UP group default qlen 1000 link-netns peer_ns-Urqh3o inet 169.254.7.1/24 scope global veth7 Therefore, using a local address that matches the nexthop device fails as well: # ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth3 onlink Error: Nexthop has invalid gateway. Using a gateway address with a "wrong" nexthop device is actually valid and allowed: # ip route get 169.254.1.2 169.254.1.2 dev veth1 src 169.254.1.1 uid 0 # ip ro add table 254 169.254.101.102/32 via 169.254.1.2 dev veth3 onlink # echo $? 0 Remove these tests given that their output is confusing and that the scenario that they are testing is already covered by other tests. A subsequent patch will add tests for the nexthop device mismatch scenario. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 6 ------ 1 file changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index ec2d6ceb1f08..1bb1c2289650 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -315,12 +315,6 @@ invalid_onlink_ipv4() "Invalid gw - local unicast address, VRF" run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given" - - run_ip 254 ${TEST_NET4[1]}.102 ${V4ADDRS[p3]} ${NETIFS[p1]} 2 \ - "Gateway resolves to wrong nexthop device" - - run_ip ${VRF_TABLE} ${TEST_NET4[2]}.103 ${V4ADDRS[p7]} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } ################################################################################ -- cgit v1.2.3 From 0a3419f4ba407b9624c315bfd6f0056caf536898 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:10 +0200 Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv6 tests The command in the test fails as expected because IPv6 forbids a nexthop device mismatch: # ./fib-onlink-tests.sh -v [...] COMMAND: ip -6 ro add table 1101 2001:db8:102::103/128 via 2001:db8:701::64 dev veth5 onlink Error: Nexthop has invalid gateway or device mismatch. TEST: Gateway resolves to wrong nexthop device - VRF [ OK ] [...] Where: # ip route get 2001:db8:701::64 vrf lisa 2001:db8:701::64 dev veth7 table 1101 proto kernel src 2001:db8:701::1 metric 256 pref medium This is in contrast to IPv4 where a nexthop device mismatch is allowed when "onlink" is specified: # ip route get 169.254.7.2 vrf lisa 169.254.7.2 dev veth7 table 1101 src 169.254.7.1 uid 0 # ip ro add table 1101 169.254.102.103/32 via 169.254.7.2 dev veth5 onlink # echo $? 0 Remove these tests in preparation for aligning IPv6 with IPv4 and allowing nexthop device mismatch when "onlink" is specified. A subsequent patch will add tests that verify that both address families allow a nexthop device mismatch with "onlink". Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 7 ------- 1 file changed, 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 1bb1c2289650..63477be859e3 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -432,13 +432,6 @@ invalid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::101 ${V6ADDRS[p1]} "" 2 \ "No nexthop device given" - - # default VRF validation is done against LOCAL table - # run_ip6 254 ${TEST_NET6[1]}::102 ${V6ADDRS[p3]/::[0-9]/::64} ${NETIFS[p1]} 2 \ - # "Gateway resolves to wrong nexthop device" - - run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::103 ${V6ADDRS[p7]/::[0-9]/::64} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } run_onlink_tests() -- cgit v1.2.3 From 9bf8345fb38ab9bd771bd430073cbd3e912fcf75 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:11 +0200 Subject: selftests: fib-onlink: Add a test case for IPv4 multicast gateway A multicast gateway address should be rejected when "onlink" is specified, but it is only tested as part of the IPv6 tests. Add an equivalent IPv4 test. # ./fib-onlink-tests.sh -v [...] COMMAND: ip ro add table 254 169.254.101.12/32 via 233.252.0.1 dev veth1 onlink Error: Nexthop has invalid gateway. TEST: Invalid gw - multicast address [ OK ] [...] COMMAND: ip ro add table 1101 169.254.102.12/32 via 233.252.0.1 dev veth5 onlink Error: Nexthop has invalid gateway. TEST: Invalid gw - multicast address, VRF [ OK ] [...] Tests passed: 37 Tests failed: 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 63477be859e3..7a0fd7a91e4e 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -72,7 +72,8 @@ declare -A TEST_NET4IN6IN6 TEST_NET4IN6[1]=10.1.1.254 TEST_NET4IN6[2]=10.2.1.254 -# mcast address +# mcast addresses +MCAST4=233.252.0.1 MCAST6=ff02::1 VRF=lisa @@ -310,9 +311,13 @@ invalid_onlink_ipv4() { run_ip 254 ${TEST_NET4[1]}.11 ${V4ADDRS[p1]} ${NETIFS[p1]} 2 \ "Invalid gw - local unicast address" + run_ip 254 ${TEST_NET4[1]}.12 ${MCAST4} ${NETIFS[p1]} 2 \ + "Invalid gw - multicast address" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.11 ${V4ADDRS[p5]} ${NETIFS[p5]} 2 \ "Invalid gw - local unicast address, VRF" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.12 ${MCAST4} ${NETIFS[p5]} 2 \ + "Invalid gw - multicast address, VRF" run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given" } -- cgit v1.2.3 From f8f9ee9d8b2ed1f29309399020f2fc30f7f93035 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:13 +0200 Subject: selftests: fib-onlink: Add test cases for nexthop device mismatch Add test cases that verify that when the "onlink" keyword is specified, both address families (with and without VRF) accept routes with a gateway address that is reachable via a different interface than the one specified. Output without "ipv6: Allow for nexthop device mismatch with "onlink"": # ./fib-onlink-tests.sh | grep mismatch TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [FAIL] TEST: nexthop device mismatch [FAIL] Output with "ipv6: Allow for nexthop device mismatch with "onlink"": # ./fib-onlink-tests.sh | grep mismatch TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] That is, the IPv4 tests were always passing, but the IPv6 ones only pass after the specified patch. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 7a0fd7a91e4e..b5773ac8847d 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -271,11 +271,15 @@ valid_onlink_ipv4() run_ip 254 ${TEST_NET4[1]}.1 ${CONGW[1]} ${NETIFS[p1]} 0 "unicast connected" run_ip 254 ${TEST_NET4[1]}.2 ${RECGW4[1]} ${NETIFS[p1]} 0 "unicast recursive" + run_ip 254 ${TEST_NET4[1]}.9 ${CONGW[1]} ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.1 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.2 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.10 ${CONGW[3]} ${NETIFS[p7]} 0 \ + "nexthop device mismatch" log_subsection "VRF device, PBR table" @@ -366,12 +370,16 @@ valid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::1 ${V6ADDRS[p1]/::*}::64 ${NETIFS[p1]} 0 "unicast connected" run_ip6 254 ${TEST_NET6[1]}::2 ${RECGW6[1]} ${NETIFS[p1]} 0 "unicast recursive" run_ip6 254 ${TEST_NET6[1]}::3 ::ffff:${TEST_NET4IN6[1]} ${NETIFS[p1]} 0 "v4-mapped" + run_ip6 254 ${TEST_NET6[1]}::a ${V6ADDRS[p1]/::*}::64 ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::1 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::2 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::3 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped" + run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::b ${V6ADDRS[p5]/::*}::64 \ + ${NETIFS[p7]} 0 "nexthop device mismatch" log_subsection "VRF device, PBR table" -- cgit v1.2.3 From 9d48c62f6b4ed70ebeea70f52ddb1c6d8613bed4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 12 Jan 2026 19:37:14 +0200 Subject: selftests: drv-net: fix RPS mask handling in toeplitz test The toeplitz.py test passed the hex mask without "0x" prefix (e.g., "300" for CPUs 8,9). The toeplitz.c strtoul() call wrongly parsed this as decimal 300 (0x12c) instead of hex 0x300. Pass the prefixed mask to toeplitz.c, and the unprefixed one to sysfs. Fixes: 9cf9aa77a1f6 ("selftests: drv-net: hw: convert the Toeplitz test to Python") Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260112173715.384843-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py index d2db5ee9e358..d288c57894f6 100755 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.py +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -94,12 +94,14 @@ def _configure_rps(cfg, rps_cpus): mask = 0 for cpu in rps_cpus: mask |= (1 << cpu) - mask = hex(mask)[2:] + + mask = hex(mask) # Set RPS bitmap for all rx queues for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): with open(rps_file, "w", encoding="utf-8") as fp: - fp.write(mask) + # sysfs expects hex without '0x' prefix, toeplitz.c needs the prefix + fp.write(mask[2:]) return mask -- cgit v1.2.3 From cf055f8c000445aa688c53a706ef4f580818eedb Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 12 Jan 2026 19:37:15 +0200 Subject: selftests: drv-net: fix RPS mask handling for high CPU numbers The RPS bitmask bounds check uses ~(RPS_MAX_CPUS - 1) which equals ~15 = 0xfff0, only allowing CPUs 0-3. Change the mask to ~((1UL << RPS_MAX_CPUS) - 1) = ~0xffff to allow CPUs 0-15. Fixes: 5ebfb4cc3048 ("selftests/net: toeplitz test") Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260112173715.384843-3-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index d23b3b0c20a3..285bb17df9c2 100644 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -485,8 +485,8 @@ static void parse_rps_bitmap(const char *arg) bitmap = strtoul(arg, NULL, 0); - if (bitmap & ~(RPS_MAX_CPUS - 1)) - error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu", + if (bitmap & ~((1UL << RPS_MAX_CPUS) - 1)) + error(1, 0, "rps bitmap 0x%lx out of bounds, max cpu %lu", bitmap, RPS_MAX_CPUS - 1); for (i = 0; i < RPS_MAX_CPUS; i++) -- cgit v1.2.3 From f8ade2342e22e7dbc71af496f07c900f8c69dd54 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:47 +0000 Subject: bpf: return PTR_TO_BTF_ID | PTR_TRUSTED from BPF kfuncs by default Teach the BPF verifier to treat pointers to struct types returned from BPF kfuncs as implicitly trusted (PTR_TO_BTF_ID | PTR_TRUSTED) by default. Returning untrusted pointers to struct types from BPF kfuncs should be considered an exception only, and certainly not the norm. Update existing selftests to reflect the change in register type printing (e.g. `ptr_` becoming `trusted_ptr_` in verifier error messages). Link: https://lore.kernel.org/bpf/aV4nbCaMfIoM0awM@google.com/ Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 46 ++++++++++++++-------- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 4 +- .../struct_ops_kptr_return_fail__wrong_type.c | 2 +- .../selftests/bpf/progs/verifier_global_ptr_args.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 5 files changed, 34 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45733bae271d..faa1ecc1fe9d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14212,26 +14212,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - else if (is_kfunc_rcu_protected(&meta)) - regs[BPF_REG_0].type |= MEM_RCU; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. + */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 4c0ff01f1a96..6443b320c732 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) SEC("?tc") __failure -__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") +__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") +__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c index 6a2dd5367802..c8d217e89eea 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c @@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym; * reject programs returning a referenced kptr of the wrong type. */ SEC("struct_ops/test_return_ref_kptr") -__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)") +__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)") struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy, struct task_struct *task, struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 1204fbc58178..e7dae0cf9c17 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") int trusted_task_arg_nonnull_fail2(void *ctx) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c8d640802cce..9ca83dce100d 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", -- cgit v1.2.3 From bbdbed193bcf57f1e9c0d9d58c3ad3350bfd0bd1 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:49 +0000 Subject: selftests/bpf: assert BPF kfunc default trusted pointer semantics The BPF verifier was recently updated to treat pointers to struct types returned from BPF kfuncs as implicitly trusted by default. Add a new test case to exercise this new implicit trust semantic. The KF_ACQUIRE flag was dropped from the bpf_get_root_mem_cgroup() kfunc because it returns a global pointer to root_mem_cgroup without performing any explicit reference counting. This makes it an ideal candidate to verify the new implicit trusted pointer semantics. Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5829ffd70f8f..38c5ba70100c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -61,6 +61,7 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" +#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -202,6 +203,7 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } +void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c new file mode 100644 index 000000000000..13564956f621 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("syscall") +__success __retval(0) +int root_mem_cgroup_default_trusted(void *ctx) +{ + unsigned long usage; + struct mem_cgroup *root_mem_cgroup; + + root_mem_cgroup = bpf_get_root_mem_cgroup(); + if (!root_mem_cgroup) + return 1; + + /* + * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | + * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + usage = bpf_mem_cgroup_usage(root_mem_cgroup); + __sink(usage); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c656807675e09604af09a4b9f3ea466af91b7b7a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:47 +0000 Subject: selftests/bpf: Add tests for loading insn array values with offsets The ldimm64 instruction for map value supports an offset. For insn array maps it wasn't tested before, as normally such instructions aren't generated. However, this is still possible to pass such instructions, so add a few tests to check that correct offsets work properly and incorrect offsets are rejected. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260111153047.8388-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_gotox.c | 208 +++++++++++++++++++++ 1 file changed, 208 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c index d138cc7b1bda..75b0cf2467ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -240,6 +240,208 @@ static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) bpf_link__destroy(link); } +/* + * The following subtests do not use skeleton rather than to check + * if the test should be skipped. + */ + +static int create_jt_map(__u32 max_entries) +{ + const char *map_name = "jt"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, map_name, + key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt) +{ + return bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); +} + +static int __check_ldimm64_off_prog_load(__u32 max_entries, __u32 off) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int map_fd, ret; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + insns[0].imm = map_fd; + insns[1].imm = off; + + ret = prog_load(insns, ARRAY_SIZE(insns)); + close(map_fd); + return ret; +} + +/* + * Check that loads from an instruction array map are only allowed with offsets + * which are multiples of 8 and do not point to outside of the map. + */ +static void check_ldimm64_off_load(struct bpf_gotox *skel __always_unused) +{ + const __u32 max_entries = 10; + int prog_fd; + __u32 off; + + for (off = 0; off < max_entries; off++) { + prog_fd = __check_ldimm64_off_prog_load(max_entries, off * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_off_prog_load")) + return; + close(prog_fd); + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, 7 /* not a multiple of 8 */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, max_entries * 8 /* too large */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } +} + +static int __check_ldimm64_gotox_prog_load(struct bpf_insn *insns, + __u32 insn_cnt, + __u32 off1, __u32 off2) +{ + const __u32 values[] = {5, 7, 9, 11, 13, 15}; + const __u32 max_entries = ARRAY_SIZE(values); + struct bpf_insn_array_value val = {}; + int map_fd, ret, i; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + + for (i = 0; i < max_entries; i++) { + val.orig_off = values[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, + "bpf_map_update_elem")) { + close(map_fd); + return -1; + } + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + /* r1 = &map + offset1 */ + insns[0].imm = map_fd; + insns[1].imm = off1; + + /* r1 += off2 */ + insns[2].imm = off2; + + ret = prog_load(insns, insn_cnt); + close(map_fd); + return ret; +} + +static void reject_offsets(struct bpf_insn *insns, __u32 insn_cnt, __u32 off1, __u32 off2) +{ + int prog_fd; + + prog_fd = __check_ldimm64_gotox_prog_load(insns, insn_cnt, off1, off2); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_gotox_prog_load")) + close(prog_fd); +} + +/* + * Verify a bit more complex programs which include indirect jumps + * and with jump tables loaded with a non-zero offset + */ +static void check_ldimm64_off_gotox(struct bpf_gotox *skel __always_unused) +{ + struct bpf_insn insns[] = { + /* + * The following instructions perform an indirect jump to + * labels below. Thus valid offsets in the map are {0,...,5}. + * The program rewrites the offsets in the instructions below: + * r1 = &map + offset1 + * r1 += offset2 + * r1 = *r1 + * gotox r1 + */ + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0, 0), + + /* case 0: */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* case 1: */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* case 2: */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* case 3: */ + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* case 4: */ + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* default: */ + BPF_MOV64_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + }; + int prog_fd, err; + __u32 off1, off2; + + /* allow all combinations off1 + off2 < 6 */ + for (off1 = 0; off1 < 6; off1++) { + for (off2 = 0; off1 + off2 < 6; off2++) { + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = __check_ldimm64_gotox_prog_load(insns, ARRAY_SIZE(insns), + off1 * 8, off2 * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_gotox_prog_load")) + return; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) { + close(prog_fd); + return; + } + + if (!ASSERT_EQ(topts.retval, off1 + off2, "test_run_opts retval")) { + close(prog_fd); + return; + } + + close(prog_fd); + } + } + + /* reject off1 + off2 >= 6 */ + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 3, 8 * 3); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 7, 8 * 0); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 0, 8 * 7); + + /* reject (off1 + off2) % 8 != 0 */ + reject_offsets(insns, ARRAY_SIZE(insns), 3, 3); + reject_offsets(insns, ARRAY_SIZE(insns), 7, 0); + reject_offsets(insns, ARRAY_SIZE(insns), 0, 7); +} + void test_bpf_gotox(void) { struct bpf_gotox *skel; @@ -288,5 +490,11 @@ void test_bpf_gotox(void) if (test__start_subtest("one-map-two-jumps")) __subtest(skel, check_one_map_two_jumps); + if (test__start_subtest("check-ldimm64-off")) + __subtest(skel, check_ldimm64_off_load); + + if (test__start_subtest("check-ldimm64-off-gotox")) + __subtest(skel, check_ldimm64_off_gotox); + bpf_gotox__destroy(skel); } -- cgit v1.2.3 From 7158fc54b2c6f124eec0d7cd13bff69da0172e59 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 08:08:44 +0100 Subject: vdso: Remove struct getcpu_cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache parameter of getcpu() is useless nowadays for various reasons. * It is never passed by userspace for either the vDSO or syscalls. * It is never used by the kernel. * It could not be made to work on the current vDSO architecture. * The structure definition is not part of the UAPI headers. * vdso_getcpu() is superseded by restartable sequences in any case. Remove the struct and its header. As a side-effect this gets rid of an unwanted inclusion of the linux/ header namespace from vDSO code. [ tglx: Adapt to s390 upstream changes */ Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Acked-by: Heiko Carstens # s390 Link: https://patch.msgid.link/20251230-getcpu_cache-v3-1-fb9c5f880ebe@linutronix.de --- arch/loongarch/vdso/vgetcpu.c | 5 ++--- arch/s390/kernel/vdso/getcpu.c | 3 +-- arch/s390/kernel/vdso/vdso.h | 4 +--- arch/x86/entry/vdso/vgetcpu.c | 5 ++--- arch/x86/include/asm/vdso/processor.h | 4 +--- include/linux/getcpu.h | 19 ------------------- include/linux/syscalls.h | 3 +-- kernel/sys.c | 4 +--- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 4 +--- 9 files changed, 10 insertions(+), 41 deletions(-) delete mode 100644 include/linux/getcpu.h (limited to 'tools') diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c index 73af49242ecd..6f054ec898c7 100644 --- a/arch/loongarch/vdso/vgetcpu.c +++ b/arch/loongarch/vdso/vgetcpu.c @@ -4,7 +4,6 @@ */ #include -#include static __always_inline int read_cpu_id(void) { @@ -28,8 +27,8 @@ static __always_inline int read_cpu_id(void) } extern -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused); -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused) +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused); +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused) { int cpu_id; diff --git a/arch/s390/kernel/vdso/getcpu.c b/arch/s390/kernel/vdso/getcpu.c index 5c5d4a848b76..1e17665616c5 100644 --- a/arch/s390/kernel/vdso/getcpu.c +++ b/arch/s390/kernel/vdso/getcpu.c @@ -2,11 +2,10 @@ /* Copyright IBM Corp. 2020 */ #include -#include #include #include "vdso.h" -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused) { union tod_clock clk; diff --git a/arch/s390/kernel/vdso/vdso.h b/arch/s390/kernel/vdso/vdso.h index 8cff033dd854..1fe52a6f5a56 100644 --- a/arch/s390/kernel/vdso/vdso.h +++ b/arch/s390/kernel/vdso/vdso.h @@ -4,9 +4,7 @@ #include -struct getcpu_cache; - -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index e4640306b2e3..6381b472b7c5 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -6,17 +6,16 @@ */ #include -#include #include #include notrace long -__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +__vdso_getcpu(unsigned *cpu, unsigned *node, void *unused) { vdso_read_cpunode(cpu, node); return 0; } -long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +long getcpu(unsigned *cpu, unsigned *node, void *tcache) __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index 7000aeb59aa2..93e0e24e5cb4 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -18,9 +18,7 @@ static __always_inline void cpu_relax(void) native_pause(); } -struct getcpu_cache; - -notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); #endif /* __ASSEMBLER__ */ diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h deleted file mode 100644 index c304dcdb4eac..000000000000 --- a/include/linux/getcpu.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GETCPU_H -#define _LINUX_GETCPU_H 1 - -/* Cache for getcpu() to speed it up. Results might be a short time - out of date, but will be faster. - - User programs should not refer to the contents of this structure. - I repeat they should not refer to it. If they do they will break - in future kernels. - - It is only a private cache for vgetcpu(). It will change in future kernels. - The user program must store this information per thread (__thread) - If you want 100% accurate information pass NULL instead. */ -struct getcpu_cache { - unsigned long blob[128 / sizeof(long)]; -}; - -#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cf84d98964b2..23704e006afd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -59,7 +59,6 @@ struct compat_stat; struct old_timeval32; struct robust_list_head; struct futex_waitv; -struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; struct file_handle; @@ -718,7 +717,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru); asmlinkage long sys_umask(int mask); asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); -asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); +asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, void __user *cache); asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv, diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..f1780ab132a3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -2876,8 +2875,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index bea8ad54da11..3fe49cbdae98 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -16,9 +16,7 @@ #include "vdso_config.h" #include "vdso_call.h" -struct getcpu_cache; -typedef long (*getcpu_t)(unsigned int *, unsigned int *, - struct getcpu_cache *); +typedef long (*getcpu_t)(unsigned int *, unsigned int *, void *); int main(int argc, char **argv) { -- cgit v1.2.3 From 1d7cf255eefbb479d0eea9aa3b6372a1e52f8c62 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 16 Oct 2025 13:51:25 -0700 Subject: tools headers: Update the linux/unaligned.h copy with the kernel sources To pick up the changes in: vdso: Switch get/put_unaligned() from packed struct to memcpy As the code is dependent on __unqual_scalar_typeof, update also the tools version of compiler_types.h to include this. Signed-off-by: Ian Rogers Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251016205126.2882625-4-irogers@google.com --- tools/include/linux/compiler_types.h | 22 +++++++++++++++++++ tools/include/vdso/unaligned.h | 41 ++++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index d09f9dc172a4..890982283a5e 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -40,4 +40,26 @@ #define asm_goto_output(x...) asm goto(x) #endif +/* + * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged. + */ +/* + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (signed type)0 + +#define __unqual_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (x))) + #endif /* __LINUX_COMPILER_TYPES_H */ diff --git a/tools/include/vdso/unaligned.h b/tools/include/vdso/unaligned.h index ff0c06b6513e..9076483c9fbb 100644 --- a/tools/include/vdso/unaligned.h +++ b/tools/include/vdso/unaligned.h @@ -2,14 +2,43 @@ #ifndef __VDSO_UNALIGNED_H #define __VDSO_UNALIGNED_H -#define __get_unaligned_t(type, ptr) ({ \ - const struct { type x; } __packed * __get_pptr = (typeof(__get_pptr))(ptr); \ - __get_pptr->x; \ +#include + +/** + * __get_unaligned_t - read an unaligned value from memory. + * @type: the type to load from the pointer. + * @ptr: the pointer to load from. + * + * Use memcpy to affect an unaligned type sized load avoiding undefined behavior + * from approaches like type punning that require -fno-strict-aliasing in order + * to be correct. As type may be const, use __unqual_scalar_typeof to map to a + * non-const type - you can't memcpy into a const type. The + * __get_unaligned_ctrl_type gives __unqual_scalar_typeof its required + * expression rather than type, a pointer is used to avoid warnings about mixing + * the use of 0 and NULL. The void* cast silences ubsan warnings. + */ +#define __get_unaligned_t(type, ptr) ({ \ + type *__get_unaligned_ctrl_type __always_unused = NULL; \ + __unqual_scalar_typeof(*__get_unaligned_ctrl_type) __get_unaligned_val; \ + __builtin_memcpy(&__get_unaligned_val, (void *)(ptr), \ + sizeof(__get_unaligned_val)); \ + __get_unaligned_val; \ }) -#define __put_unaligned_t(type, val, ptr) do { \ - struct { type x; } __packed * __put_pptr = (typeof(__put_pptr))(ptr); \ - __put_pptr->x = (val); \ +/** + * __put_unaligned_t - write an unaligned value to memory. + * @type: the type of the value to store. + * @val: the value to store. + * @ptr: the pointer to store to. + * + * Use memcpy to affect an unaligned type sized store avoiding undefined + * behavior from approaches like type punning that require -fno-strict-aliasing + * in order to be correct. The void* cast silences ubsan warnings. + */ +#define __put_unaligned_t(type, val, ptr) do { \ + type __put_unaligned_val = (val); \ + __builtin_memcpy((void *)(ptr), &__put_unaligned_val, \ + sizeof(__put_unaligned_val)); \ } while (0) #endif /* __VDSO_UNALIGNED_H */ -- cgit v1.2.3 From 10a62a0611f5544d209446acfde5beb7b27773c7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 16 Oct 2025 13:51:26 -0700 Subject: tools headers: Remove unneeded ignoring of warnings in unaligned.h Now that get/put_unaligned() use memcpy() the -Wpacked and -Wattributes warnings don't need disabling anymore. Signed-off-by: Ian Rogers Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251016205126.2882625-5-irogers@google.com --- tools/include/linux/unaligned.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/unaligned.h b/tools/include/linux/unaligned.h index 395a4464fe73..d51ddafed138 100644 --- a/tools/include/linux/unaligned.h +++ b/tools/include/linux/unaligned.h @@ -6,9 +6,6 @@ * This is the most generic implementation of unaligned accesses * and should work almost anywhere. */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpacked" -#pragma GCC diagnostic ignored "-Wattributes" #include #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) @@ -143,6 +140,5 @@ static inline u64 get_unaligned_be48(const void *p) { return __get_unaligned_be48(p); } -#pragma GCC diagnostic pop #endif /* __LINUX_UNALIGNED_H */ -- cgit v1.2.3 From f756ed82c62aa2725757ac011710492d4cc8c7d8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 13 Jan 2026 17:14:56 +0000 Subject: KVM: selftests: Slightly simplify memstress_setup_nested() Instead of calling memstress_setup_ept_mappings() only in the first iteration in the loop, move it before the loop. The call needed to happen within the loop before commit e40e72fec0de ("KVM: selftests: Stop passing VMX metadata to TDP mapping functions"), as memstress_setup_ept_mappings() used to take in a pointer to vmx_pages and pass it into tdp_identity_map_1g() (to get the EPT root GPA). This is no longer the case, as tdp_identity_map_1g() gets the EPT root through stage2 MMU. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260113171456.2097312-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/memstress.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 86f4c5e4c430..f53414ba7103 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -110,16 +110,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); + memstress_setup_ept_mappings(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); else vcpu_alloc_svm(vm, &nested_gva); - /* The EPTs are shared across vCPUs, setup the mappings once */ - if (vcpu_id == 0) - memstress_setup_ept_mappings(vm); - /* * Override the vCPU to run memstress_l1_guest_code() which will * bounce it into L2 before calling memstress_guest_code(). -- cgit v1.2.3 From 240156b25a397d3b26ef95b3f13bddb2db6038ab Mon Sep 17 00:00:00 2001 From: Manuel Hernández Méndez Date: Thu, 4 Dec 2025 16:40:38 +0000 Subject: perf vendor events riscv: Add CVA6 JSON file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch add the OpenHW Core-V CVA6 Risc-V JSON file. For more info: https://openhwfoundation.org/news/2023/11/07/openhw-group-announces-core-v-cva6-platform-project-for-risc-v-software-development-and-testing/ Signed-off-by: Manuel Hernández Méndez Reviewed-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/riscv/mapfile.csv | 1 + .../arch/riscv/openhwgroup/cva6/firmware.json | 68 ++++++++++++++++++++++ .../arch/riscv/openhwgroup/cva6/instructions.json | 47 +++++++++++++++ .../arch/riscv/openhwgroup/cva6/memory.json | 42 +++++++++++++ .../arch/riscv/openhwgroup/cva6/microarch.json | 27 +++++++++ 5 files changed, 185 insertions(+) create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv index d5eea7f9aa9a..87cfb0e0849f 100644 --- a/tools/perf/pmu-events/arch/riscv/mapfile.csv +++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv @@ -21,5 +21,6 @@ 0x489-0x8000000000000[1-6]08-0x[9b][[:xdigit:]]+,v1,sifive/p650,core 0x5b7-0x0-0x0,v1,thead/c900-legacy,core 0x5b7-0x80000000090c0d00-0x2047000,v1,thead/c900-legacy,core +0x602-0x3-0x0,v1,openhwgroup/cva6,core 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core 0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json new file mode 100644 index 000000000000..7149caec4f80 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json @@ -0,0 +1,68 @@ +[ + { + "ArchStdEvent": "FW_MISALIGNED_LOAD" + }, + { + "ArchStdEvent": "FW_MISALIGNED_STORE" + }, + { + "ArchStdEvent": "FW_ACCESS_LOAD" + }, + { + "ArchStdEvent": "FW_ACCESS_STORE" + }, + { + "ArchStdEvent": "FW_ILLEGAL_INSN" + }, + { + "ArchStdEvent": "FW_SET_TIMER" + }, + { + "ArchStdEvent": "FW_IPI_SENT" + }, + { + "ArchStdEvent": "FW_IPI_RECEIVED" + }, + { + "ArchStdEvent": "FW_FENCE_I_SENT" + }, + { + "ArchStdEvent": "FW_FENCE_I_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_SENT" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json new file mode 100644 index 000000000000..c38f6c97cf1f --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json @@ -0,0 +1,47 @@ +[ + { + "EventName": "LOAD_INSTRUCTIONS_RETIRED", + "EventCode": "0x5", + "BriefDescription": "number of data memory load instructions retired" + }, + { + "EventName": "STORE_INSTRUCTIONS_RETIRED", + "EventCode": "0x6", + "BriefDescription": "number of data memory store instructions retired" + }, + { + "EventName": "EXCEPTIONS", + "EventCode": "0x7", + "BriefDescription": "valid exceptions encountered" + }, + { + "EventName": "EXCEPTION_HANDLER_RETURNS", + "EventCode": "0x8", + "BriefDescription": "return from an exception" + }, + { + "EventName": "BRANCH_INSTRUCTIONS_RETIRED", + "EventCode": "0x9", + "BriefDescription": "number of branch instructions encountered retired" + }, + { + "EventName": "CALL_INSTRUCTIONS_RETIRED", + "EventCode": "0xC", + "BriefDescription": "number of call instructions retired" + }, + { + "EventName": "RETURN_INSTRUCTIONS_RETIRED", + "EventCode": "0xD", + "BriefDescription": "number of return instructions retired" + }, + { + "EventName": "INTEGER_INSTRUCTIONS_RETIRED", + "EventCode": "0x14", + "BriefDescription": "number of integer instructions retired" + }, + { + "EventName": "FLOATING_POINT_INSTRUCTIONS_RETIRED", + "EventCode": "0x15", + "BriefDescription": "number of floating point instructions retired" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json new file mode 100644 index 000000000000..c4f376a0ee4e --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json @@ -0,0 +1,42 @@ +[ + { + "EventName": "L1_I_CACHE_MISSES", + "EventCode": "0x1", + "BriefDescription": "number of misses in L1 I-Cache" + }, + { + "EventName": "L1_D_CACHE_MISSES", + "EventCode": "0x2", + "BriefDescription": "number of misses in L1 D-Cache" + }, + { + "EventName": "ITLB_MISSES", + "EventCode": "0x3", + "BriefDescription": "number of misses in ITLB" + }, + { + "EventName": "DTLB_MISSES", + "EventCode": "0x4", + "BriefDescription": "number of misses in DTLB" + }, + { + "EventName": "L1_I_CACHE_ACCESSES", + "EventCode": "0x10", + "BriefDescription": "number of accesses to instruction cache" + }, + { + "EventName": "L1_D_CACHE_ACCESSES", + "EventCode": "0x11", + "BriefDescription": "number of accesses to data cache" + }, + { + "EventName": "L1_CACHE_LINE_EVICTION", + "EventCode": "0x12", + "BriefDescription": "number of data cache line eviction" + }, + { + "EventName": "ITLB_FLUSH", + "EventCode": "0x13", + "BriefDescription": "number of ITLB flushes" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json new file mode 100644 index 000000000000..104e6e8197da --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json @@ -0,0 +1,27 @@ +[ + { + "EventName": "BRANCH_MISPREDICTS", + "EventCode": "0xA", + "BriefDescription": "number of branch mispredictions" + }, + { + "EventName": "BRANCH_EXCEPTIONS", + "EventCode": "0xB", + "BriefDescription": "number of valid branch exceptions" + }, + { + "EventName": "MSB_FULL", + "EventCode": "0xE", + "BriefDescription": "scoreboard is full" + }, + { + "EventName": "INSTRUCTION_FETCH_EMPTY", + "EventCode": "0xF", + "BriefDescription": "number of invalid instructions in IF stage" + }, + { + "EventName": "PIPELINE_STALL", + "EventCode": "0x16", + "BriefDescription": "number of cycles the pipeline is stalled during read operands" + } +] -- cgit v1.2.3 From 2c3cd43d27c1148fae05b50870f970ab24464fd5 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 8 Jan 2026 13:22:17 +0530 Subject: perf vendor events amd: Add Zen 6 mapping Add a regular expression in the map file so that appropriate JSON event files are used for AMD Zen 6 processors. Restrict the regular expression for AMD Zen 5 processors to known model ranges since they also belong to Family 1Ah. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian [ Moved this one to the front of the series to keep the tree bisectable, as per Ian Rogers suggestion ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 3d0c57198056..149bbe7abaf5 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -43,4 +43,5 @@ AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core AuthenticAMD-25-([245][[:xdigit:]]|[[:xdigit:]]),v1,amdzen3,core AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen4,core -AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen5,core +AuthenticAMD-26-([12467][[:xdigit:]]|[[:xdigit:]]),v1,amdzen5,core +AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen6,core -- cgit v1.2.3 From 2f42fb0661d9a979800a506b6a91dc3a7d1fb162 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 8 Jan 2026 13:22:14 +0530 Subject: perf vendor events amd: Add Zen 6 core events Add core events taken from Section 1.5 "Core Performance Monitor Counters" of the Performance Monitor Counters for AMD Family 1Ah Model 50h-57h Processors document available at the link below. This constitutes events which capture information on op dispatch, execution and retirement, branch prediction, L1 and L2 cache activity, TLB activity, etc. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=309149 Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/amdzen6/branch-prediction.json | 93 ++ tools/perf/pmu-events/arch/x86/amdzen6/decode.json | 139 +++ .../pmu-events/arch/x86/amdzen6/execution.json | 192 ++++ .../arch/x86/amdzen6/floating-point.json | 1106 ++++++++++++++++++++ .../pmu-events/arch/x86/amdzen6/inst-cache.json | 120 +++ .../perf/pmu-events/arch/x86/amdzen6/l2-cache.json | 326 ++++++ .../pmu-events/arch/x86/amdzen6/load-store.json | 523 +++++++++ 7 files changed, 2499 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/decode.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/execution.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/load-store.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json new file mode 100644 index 000000000000..dd70069f68ed --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json @@ -0,0 +1,93 @@ +[ + { + "EventName": "bp_l1_tlb_miss_l2_tlb_hit", + "EventCode": "0x84", + "BriefDescription": "Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB." + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).", + "UMask": "0x08" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.all", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for all page sizes.", + "UMask": "0x0f" + }, + { + "EventName": "bp_pipe_correct", + "EventCode": "0x8b", + "BriefDescription": "Branch predictor pipeline flushes due to internal conditions such as a second level prediction structure." + }, + { + "EventName": "bp_var_target_pred", + "EventCode": "0x8e", + "BriefDescription": "Indirect predictions (branch used the indirect predictor to make a prediction)." + }, + { + "EventName": "bp_early_redir", + "EventCode": "0x91", + "BriefDescription": "Early redirects sent to branch predictor. This happens when either the decoder or dispatch logic is able to detect that the branch predictor needs to be redirected." + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if4k", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages (16k pages created from four adjacent 4k pages).", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if2m", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if1g", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.all", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for all page sizes.", + "UMask": "0x07" + }, + { + "EventName": "bp_fe_redir.resync", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the pipeline frontend caused by resyncs. These are retire time pipeline restarts.", + "UMask": "0x01" + }, + { + "EventName": "bp_fe_redir.ex_redir", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the pipeline frontend caused by mispredicts. These are used for branch direction correction and handling indirect branch target mispredicts.", + "UMask": "0x02" + }, + { + "EventName": "bp_fe_redir.all", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the pipeline frontend caused by any reason." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/decode.json b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json new file mode 100644 index 000000000000..c5d37fbac948 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json @@ -0,0 +1,139 @@ +[ + { + "EventName": "de_op_queue_empty", + "EventCode": "0xa9", + "BriefDescription": "Cycles where the op queue is empty. Such cycles indicate that the frontend is not delivering instructions fast enough." + }, + { + "EventName": "de_src_op_disp.x86_decoder", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from x86 decoder.", + "UMask": "0x01" + }, + { + "EventName": "de_src_op_disp.op_cache", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from op cache.", + "UMask": "0x02" + }, + { + "EventName": "de_src_op_disp.all", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from any source.", + "UMask": "0x07" + }, + { + "EventName": "de_dis_ops_from_decoder.any_fp", + "EventCode": "0xab", + "BriefDescription": "Ops dispatched from the decoder to a floating-point unit.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_ops_from_decoder.any_int", + "EventCode": "0xab", + "BriefDescription": "Ops dispatched from the decoder to an integer unit.", + "UMask": "0x08" + }, + { + "EventName": "de_disp_stall_cycles_dynamic_tokens_part1.int_phy_reg_file_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to integer physical register file resource stalls.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.load_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to load queue token stalls.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.store_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to store queue token stalls.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.taken_brnch_buffer_rsrc", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to taken branch buffer resource stalls.", + "UMask": "0x10" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.fp_sch_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to floating-point non-schedulable queue token stalls.", + "UMask": "0x40" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq0", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 0 tokens.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq1", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 1 tokens.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq2", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 2 tokens.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq3", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 3 tokens.", + "UMask": "0x08" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq4", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 4 tokens.", + "UMask": "0x10" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq5", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 5 tokens.", + "UMask": "0x20" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ret_q", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of retire queue tokens.", + "UMask": "0x80" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.all", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to any token stalls.", + "UMask": "0xbf" + }, + { + "EventName": "de_no_dispatch_per_slot.no_ops_from_frontend", + "EventCode": "0x1a0", + "BriefDescription": "Dispatch slots in each cycle that were empty because the frontend did not supply ops.", + "UMask": "0x01" + }, + { + "EventName": "de_no_dispatch_per_slot.backend_stalls", + "EventCode": "0x1a0", + "BriefDescription": "Dispatch slots in each cycle that were unused because of backend stalls.", + "UMask": "0x1e" + }, + { + "EventName": "de_no_dispatch_per_slot.smt_contention", + "EventCode": "0x1a0", + "BriefDescription": "Dispatch slots in each cycle that were unused because the dispatch cycle was granted to the other SMT thread.", + "UMask": "0x60" + }, + { + "EventName": "de_additional_resource_stalls.dispatch_stalls", + "EventCode": "0x1a2", + "BriefDescription": "Counts additional cycles where dispatch is stalled due to a lack of dispatch resources.", + "UMask": "0x30" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/execution.json b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json new file mode 100644 index 000000000000..1b80acc89b6f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json @@ -0,0 +1,192 @@ +[ + { + "EventName": "ex_ret_instr", + "EventCode": "0xc0", + "BriefDescription": "Retired instructions." + }, + { + "EventName": "ex_ret_ops", + "EventCode": "0xc1", + "BriefDescription": "Retired macro-ops." + }, + { + "EventName": "ex_ret_brn", + "EventCode": "0xc2", + "BriefDescription": "Retired branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_misp", + "EventCode": "0xc3", + "BriefDescription": "Retired branch instructions that were mispredicted." + }, + { + "EventName": "ex_ret_brn_tkn", + "EventCode": "0xc4", + "BriefDescription": "Retired taken branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_tkn_misp", + "EventCode": "0xc5", + "BriefDescription": "Retired taken branch instructions that were mispredicted." + }, + { + "EventName": "ex_ret_brn_far", + "EventCode": "0xc6", + "BriefDescription": "Retired far control transfers (far call, far jump, far return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts). Far control transfers are not subject to branch prediction." + }, + { + "EventName": "ex_ret_near_ret", + "EventCode": "0xc8", + "BriefDescription": "Retired near returns (RET or RET Iw)." + }, + { + "EventName": "ex_ret_near_ret_mispred", + "EventCode": "0xc9", + "BriefDescription": "Retired near returns that were mispredicted. Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_brn_ind_misp", + "EventCode": "0xca", + "BriefDescription": "Retired indirect branch instructions that were mispredicted (only EX mispredicts). Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_brn_ind", + "EventCode": "0xcc", + "BriefDescription": "Retired indirect branch instructions." + }, + { + "EventName": "ex_ret_brn_cond", + "EventCode": "0xd1", + "BriefDescription": "Retired conditional branch instructions." + }, + { + "EventName": "ex_div_busy", + "EventCode": "0xd3", + "BriefDescription": "Cycles where the divider is busy." + }, + { + "EventName": "ex_div_count", + "EventCode": "0xd4", + "BriefDescription": "Divide ops executed." + }, + { + "EventName": "ex_no_retire.empty", + "EventCode": "0xd6", + "BriefDescription": "Cycles where the thread does not retire any ops due to a lack of valid ops in the retire queue (may be caused by front-end bottlenecks or pipeline redirects).", + "UMask": "0x01" + }, + { + "EventName": "ex_no_retire.not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles where the thread does not retire any ops as the oldest retire slot is waiting to be marked as completed.", + "UMask": "0x02" + }, + { + "EventName": "ex_no_retire.other", + "EventCode": "0xd6", + "BriefDescription": "Cycles where the thread does not retire any ops due to other reasons (retire breaks, traps, faults, etc.).", + "UMask": "0x08" + }, + { + "EventName": "ex_no_retire.thread_not_selected", + "EventCode": "0xd6", + "BriefDescription": "Cycles where the thread does not retire any ops as thread arbitration did not select the current thread.", + "UMask": "0x10" + }, + { + "EventName": "ex_no_retire.load_not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles where the thread does not retire any ops due to missing load completion.", + "UMask": "0xa2" + }, + { + "EventName": "ex_ret_ucode_instr", + "EventCode": "0x1c1", + "BriefDescription": "Retired microcoded instructions." + }, + { + "EventName": "ex_ret_ucode_ops", + "EventCode": "0x1c2", + "BriefDescription": "Retired microcode ops." + }, + { + "EventName": "ex_ret_brn_cond_misp", + "EventCode": "0x1c7", + "BriefDescription": "Retired conditional branch instructions that were mispredicted due to direction mismatch." + }, + { + "EventName": "ex_ret_brn_uncond_ind_near_misp", + "EventCode": "0x1c8", + "BriefDescription": "Retired unconditional indirect near branch instructions that were mispredicted." + }, + { + "EventName": "ex_ret_brn_uncond", + "EventCode": "0x1c9", + "BriefDescription": "Retired unconditional branch instructions." + }, + { + "EventName": "ex_tagged_ibs_ops.tagged", + "EventCode": "0x1cf", + "BriefDescription": "Execution IBS tagged ops.", + "UMask": "0x01" + }, + { + "EventName": "ex_tagged_ibs_ops.tagged_ret", + "EventCode": "0x1cf", + "BriefDescription": "Execution IBS tagged ops that retired.", + "UMask": "0x02" + }, + { + "EventName": "ex_tagged_ibs_ops.rollovers", + "EventCode": "0x1cf", + "BriefDescription": "Execution IBS periodic counter rollovers due to a previous tagged op not being IBS complete.", + "UMask": "0x04" + }, + { + "EventName": "ex_tagged_ibs_ops.filtered", + "EventCode": "0x1cf", + "BriefDescription": "Execution IBS tagged ops that retired but were discarded due to IBS filtering.", + "UMask": "0x08" + }, + { + "EventName": "ex_tagged_ibs_ops.valid", + "EventCode": "0x1cf", + "BriefDescription": "Execution IBS tagged ops that resulted in a valid sample and an IBS interrupt.", + "UMask": "0x10" + }, + { + "EventName": "ex_ret_fused_instr", + "EventCode": "0x1d0", + "BriefDescription": "Retired fused instructions." + }, + { + "EventName": "ex_mprof_ibs_ops.tagged", + "EventCode": "0x2c0", + "BriefDescription": "Memory Profiler IBS tagged ops.", + "UMask": "0x01" + }, + { + "EventName": "ex_mprof_ibs_ops.tagged_ret", + "EventCode": "0x2c0", + "BriefDescription": "Memory Profiler IBS tagged ops that retired.", + "UMask": "0x02" + }, + { + "EventName": "ex_mprof_ibs_ops.rollovers", + "EventCode": "0x2c0", + "BriefDescription": "Memory Profiler IBS periodic counter rollovers due to a previous tagged op not being IBS complete.", + "UMask": "0x04" + }, + { + "EventName": "ex_mprof_ibs_ops.filtered", + "EventCode": "0x2c0", + "BriefDescription": "Memory Profiler IBS tagged ops that retired but were discarded due to IBS filtering.", + "UMask": "0x08" + }, + { + "EventName": "ex_mprof_ibs_ops.valid", + "EventCode": "0x2c0", + "BriefDescription": "Memory Profiler IBS tagged ops that resulted in a valid sample and an IBS interrupt.", + "UMask": "0x10" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json new file mode 100644 index 000000000000..03cb039434de --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json @@ -0,0 +1,1106 @@ +[ + { + "EventName": "fp_ret_x87_fp_ops.add_sub_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point add and subtract uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_x87_fp_ops.mul_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point multiply uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_x87_fp_ops.div_sqrt_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point divide and square root uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_x87_fp_ops.all", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point uops of all types.", + "UMask": "0x07" + }, + { + "EventName": "fp_ret_sse_avx_ops.add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX add and subtract FLOPs.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_sse_avx_ops.mult_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX multiply FLOPs.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_sse_avx_ops.div_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX divide and square root FLOPs.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_sse_avx_ops.mac_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX multiply-accumulate FLOPs (each operation is counted as 2 FLOPs, bfloat operations are not included).", + "UMask": "0x08" + }, + { + "EventName": "fp_ret_sse_avx_ops.bfloat16_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX bfloat16 FLOPs.", + "UMask": "0x20" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX scalar single-precision (FP32) FLOPs.", + "UMask": "0x40" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX packed single-precision (FP32) FLOPs.", + "UMask": "0x60" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX scalar double-precision (FP64) FLOPs.", + "UMask": "0x80" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX packed double-precision (FP64) FLOPs.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_half_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX scalar half-precision (FP16) FLOPs.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_half_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX packed half-precision (FP16) FLOPs.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ret_sse_avx_ops.all", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX FLOPs of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_ret_by_width.x87", + "EventCode": "0x08", + "BriefDescription": "Retired x87 floating-point uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_ret_by_width.mmx", + "EventCode": "0x08", + "BriefDescription": "Retired MMX floating-point uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_ret_by_width.scalar", + "EventCode": "0x08", + "BriefDescription": "Retired scalar floating-point uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ops_ret_by_width.pack_128", + "EventCode": "0x08", + "BriefDescription": "Retired packed 128-bit floating-point uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_ret_by_width.pack_256", + "EventCode": "0x08", + "BriefDescription": "Retired packed 256-bit floating-point uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_ret_by_width.pack_512", + "EventCode": "0x08", + "BriefDescription": "Retired packed 512-bit floating-point uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_ret_by_width.all", + "EventCode": "0x08", + "BriefDescription": "Retired floating-point uops of all widths.", + "UMask": "0x3f" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_add", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point add uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point subtract uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply uops.", + "UMask": "0x03" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply-accumulate uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_div", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point divide uops.", + "UMask": "0x05" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point square root uops.", + "UMask": "0x06" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point compare uops.", + "UMask": "0x07" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_cvt", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point convert uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point blend uops.", + "UMask": "0x09" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_move", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point move uops.", + "UMask": "0x0a" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_shuffle", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_bfloat", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point bfloat uops.", + "UMask": "0x0c" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_logical", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point move uops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_other", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point uops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_ops_ret_by_type.scalar_all", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point uops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_ret_by_type.vector_add", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point add uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_ret_by_type.vector_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point subtract uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_ret_by_type.vector_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply uops.", + "UMask": "0x30" + }, + { + "EventName": "fp_ops_ret_by_type.vector_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply-accumulate uops.", + "UMask": "0x40" + }, + { + "EventName": "fp_ops_ret_by_type.vector_div", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point divide uops.", + "UMask": "0x50" + }, + { + "EventName": "fp_ops_ret_by_type.vector_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point square root uops.", + "UMask": "0x60" + }, + { + "EventName": "fp_ops_ret_by_type.vector_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point compare uops.", + "UMask": "0x70" + }, + { + "EventName": "fp_ops_ret_by_type.vector_cvt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point convert uops.", + "UMask": "0x80" + }, + { + "EventName": "fp_ops_ret_by_type.vector_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point blend uops.", + "UMask": "0x90" + }, + { + "EventName": "fp_ops_ret_by_type.vector_move", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point move uops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ops_ret_by_type.vector_shuffle", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_ops_ret_by_type.vector_bfloat", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point bfloat uops.", + "UMask": "0xc0" + }, + { + "EventName": "fp_ops_ret_by_type.vector_logical", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point logical uops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_ops_ret_by_type.vector_other", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point uops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_ops_ret_by_type.vector_all", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point uops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_ops_ret_by_type.all", + "EventCode": "0x0a", + "BriefDescription": "Retired floating-point uops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer add uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer subtract uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply uops.", + "UMask": "0x03" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply-accumulate uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_aes", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer AES uops.", + "UMask": "0x05" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_sha", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer SHA uops.", + "UMask": "0x06" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer compare uops.", + "UMask": "0x07" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_cvt", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer convert or pack uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shift or rotate uops.", + "UMask": "0x09" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer move uops.", + "UMask": "0x0a" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_vnni", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer VNNI uops.", + "UMask": "0x0c" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer logical uops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply uops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_sse_avx_ops_ret.mmx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer uops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer add uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer subtract uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply uops.", + "UMask": "0x30" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply-accumulate uops.", + "UMask": "0x40" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_aes", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer AES uops.", + "UMask": "0x50" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_sha", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer SHA uops.", + "UMask": "0x60" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer compare uops.", + "UMask": "0x70" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_cvt", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer convert or pack uops.", + "UMask": "0x80" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shift or rotate uops.", + "UMask": "0x90" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer move uops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_vnni", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer VNNI uops.", + "UMask": "0xc0" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer logical uops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer uops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_sse_avx_ops_ret.sse_avx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer uops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_sse_avx_ops_ret.all", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX, SSE and AVX integer uops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_pack_ops_ret.fp128_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point add uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_pack_ops_ret.fp128_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point subtract uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_pack_ops_ret.fp128_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply uops.", + "UMask": "0x03" + }, + { + "EventName": "fp_pack_ops_ret.fp128_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply-accumulate uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_pack_ops_ret.fp128_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point divide uops.", + "UMask": "0x05" + }, + { + "EventName": "fp_pack_ops_ret.fp128_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point square root uops.", + "UMask": "0x06" + }, + { + "EventName": "fp_pack_ops_ret.fp128_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point compare uops.", + "UMask": "0x07" + }, + { + "EventName": "fp_pack_ops_ret.fp128_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point convert uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_pack_ops_ret.fp128_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point blend uops.", + "UMask": "0x09" + }, + { + "EventName": "fp_pack_ops_ret.fp128_mov", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point move uops.", + "UMask": "0x0a" + }, + { + "EventName": "fp_pack_ops_ret.fp128_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_pack_ops_ret.fp128_bfloat", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point bfloat uops.", + "UMask": "0x0c" + }, + { + "EventName": "fp_pack_ops_ret.fp128_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point logical uops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_pack_ops_ret.fp128_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point uops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_pack_ops_ret.fp128_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point uops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_ops_ret.fp256_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point add uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_pack_ops_ret.fp256_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point subtract uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_pack_ops_ret.fp256_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point multiply uops.", + "UMask": "0x30" + }, + { + "EventName": "fp_pack_ops_ret.fp256_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point multiply-accumulate uops.", + "UMask": "0x40" + }, + { + "EventName": "fp_pack_ops_ret.fp256_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point divide uops.", + "UMask": "0x50" + }, + { + "EventName": "fp_pack_ops_ret.fp256_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point square root uops.", + "UMask": "0x60" + }, + { + "EventName": "fp_pack_ops_ret.fp256_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point compare uops.", + "UMask": "0x70" + }, + { + "EventName": "fp_pack_ops_ret.fp256_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point convert uops.", + "UMask": "0x80" + }, + { + "EventName": "fp_pack_ops_ret.fp256_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point blend uops.", + "UMask": "0x90" + }, + { + "EventName": "fp_pack_ops_ret.fp256_mov", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point move uops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_pack_ops_ret.fp256_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_pack_ops_ret.fp256_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point logical uops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_pack_ops_ret.fp256_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point uops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_pack_ops_ret.fp256_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point uops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_pack_ops_ret.fp_all", + "EventCode": "0x0c", + "BriefDescription": "Retired packed floating-point uops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer add uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer subtract uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply uops.", + "UMask": "0x03" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply-accumulate uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_aes", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer AES uops.", + "UMask": "0x05" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_sha", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer SHA uops.", + "UMask": "0x06" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer compare uops.", + "UMask": "0x07" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_cvt", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer convert or pack uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shift or rotate uops.", + "UMask": "0x09" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer move uops.", + "UMask": "0x0a" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_vnni", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer VNNI ops.", + "UMask": "0x0c" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer logical uops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer uops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_pack_int_ops_ret.int128_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer uops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer add uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer subtract uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply uops.", + "UMask": "0x30" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply-accumulate uops.", + "UMask": "0x40" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer compare uops.", + "UMask": "0x70" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shift or rotate uops.", + "UMask": "0x90" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer move uops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_vnni", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer VNNI uops.", + "UMask": "0xc0" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer logical uops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer uops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_pack_int_ops_ret.int256_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer uops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_pack_int_ops_ret.int_all", + "EventCode": "0x0d", + "BriefDescription": "Retired packed integer uops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_disp_faults.x87_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for x87 fills.", + "UMask": "0x01" + }, + { + "EventName": "fp_disp_faults.xmm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for XMM fills.", + "UMask": "0x02" + }, + { + "EventName": "fp_disp_faults.ymm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM fills.", + "UMask": "0x04" + }, + { + "EventName": "fp_disp_faults.ymm_spill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM spills.", + "UMask": "0x08" + }, + { + "EventName": "fp_disp_faults.sse_avx_all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types for SSE and AVX ops.", + "UMask": "0x0e" + }, + { + "EventName": "fp_disp_faults.all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_add", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point add uops.", + "UMask": "0x01" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_sub", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point subtract uops.", + "UMask": "0x02" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_mul", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point multiply uops.", + "UMask": "0x03" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_mac", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point multiply-accumulate uops.", + "UMask": "0x04" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_div", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point divide uops.", + "UMask": "0x05" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_sqrt", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point square root uops.", + "UMask": "0x06" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_cmp", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point compare uops.", + "UMask": "0x07" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_cvt", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point convert uops.", + "UMask": "0x08" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_blend", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point blend uops.", + "UMask": "0x09" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_mov", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point move uops.", + "UMask": "0x0a" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_shuffle", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_bfloat", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point bfloat uops.", + "UMask": "0x0c" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_logical", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point logical uops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_other", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point uops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_pack_512b_ops_ret.fp512_all", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed floating-point uops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_add", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer add uops.", + "UMask": "0x10" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_sub", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer subtract uops.", + "UMask": "0x20" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_mul", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer multiply uops.", + "UMask": "0x30" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_mac", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer multiply-accumulate uops.", + "UMask": "0x40" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_aes", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer AES uops.", + "UMask": "0x50" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_sha", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer SHA uops.", + "UMask": "0x60" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_cmp", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer compare uops.", + "UMask": "0x70" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_cvt", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer convert or pack uops.", + "UMask": "0x80" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_shift", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer shift or rotate uops.", + "UMask": "0x90" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_mov", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer move uops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_shuffle", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_vnni", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer VNNI uops.", + "UMask": "0xc0" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_logical", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer logical uops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_other", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer uops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_pack_512b_ops_ret.int512_all", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed integer uops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_pack_512b_ops_ret.512b_all", + "EventCode": "0x0f", + "BriefDescription": "Retired 512-bit packed uops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_nsq_read_stalls.fp_prf", + "EventCode": "0x13", + "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free physical register file (FP-PRF) entries.", + "UMask": "0x0e" + }, + { + "EventName": "fp_nsq_read_stalls.k_prf", + "EventCode": "0x13", + "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free mask physical register file (K-PRF) entries.", + "UMask": "0x0e" + }, + { + "EventName": "fp_nsq_read_stalls.fp_sq", + "EventCode": "0x13", + "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free scheduler entries.", + "UMask": "0x0e" + }, + { + "EventName": "fp_nsq_read_stalls.all", + "EventCode": "0x13", + "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to any reason.", + "UMask": "0x0e" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json new file mode 100644 index 000000000000..5ab6766f8940 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json @@ -0,0 +1,120 @@ +[ + { + "EventName": "ic_cache_fill_l2", + "EventCode": "0x82", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from the L2 cache." + }, + { + "EventName": "ic_cache_fill_sys", + "EventCode": "0x83", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from system memory or another cache." + }, + { + "EventName": "ic_fetch_ibs_events.tagged", + "EventCode": "0x188", + "BriefDescription": "Fetch IBS tagged fetches. Not all tagged fetches result in a valid sample and an IBS interrupt.", + "UMask": "0x02" + }, + { + "EventName": "ic_fetch_ibs_events.filtered", + "EventCode": "0x188", + "BriefDescription": "Fetch IBS tagged fetches that were discarded due to IBS filtering.", + "UMask": "0x08" + }, + { + "EventName": "ic_fetch_ibs_events.valid", + "EventCode": "0x188", + "BriefDescription": "Fetch IBS tagged fetches that resulted in a valid sample and an IBS interrupt.", + "UMask": "0x10" + }, + { + "EventName": "op_cache_hit_miss.hit", + "EventCode": "0x28f", + "BriefDescription": "Op cache fetch hits.", + "UMask": "0x03" + }, + { + "EventName": "op_cache_hit_miss.miss", + "EventCode": "0x28f", + "BriefDescription": "Op cache fetch misses.", + "UMask": "0x04" + }, + { + "EventName": "op_cache_hit_miss.all", + "EventCode": "0x28f", + "BriefDescription": "Op cache fetches of all types.", + "UMask": "0x07" + }, + { + "EventName": "ic_fills_from_sys.local_l2", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ic_fills_from_sys.local_ccx", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ic_fills_from_sys.local_all", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ic_fills_from_sys.near_cache", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ic_fills_from_sys.dram_io_near", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ic_fills_from_sys.far_cache", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ic_fills_from_sys.remote_cache", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ic_fills_from_sys.dram_io_far", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "ic_fills_from_sys.dram_io_all", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "ic_fills_from_sys.far_all", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "ic_fills_from_sys.alt_mem", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "ic_fills_from_sys.all", + "EventCode": "0x29c", + "BriefDescription": "Instruction cache fills where data is returned from all types of sources.", + "UMask": "0xdf" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json new file mode 100644 index 000000000000..b0b2090fb920 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json @@ -0,0 +1,326 @@ +[ + { + "EventName": "l2_request_g1.group2", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of non-cacheable type (non-cached data and instructions reads, self-modifying code checks).", + "UMask": "0x01" + }, + { + "EventName": "l2_request_g1.l2_hwpf", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests from hardware prefetchers to prefetch directly into L2 (hit or miss).", + "UMask": "0x02" + }, + { + "EventName": "l2_request_g1.prefetch_l2_cmd", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests to prefetch directly into L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_request_g1.cacheable_ic_read", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests for instruction cache reads.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g1.ls_rd_blk_c_s", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests for data cache shared reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g1.rd_blk_x", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests for data cache stores.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g1.rd_blk_l", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests for data cache reads (includes hardware and software prefetches).", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g1.dc_all", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types from data cache (includes prefetches).", + "UMask": "0xe0" + }, + { + "EventName": "l2_request_g1.no_pf_all", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types not including prefetches.", + "UMask": "0xf1" + }, + { + "EventName": "l2_request_g1.all", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of all types.", + "UMask": "0xf7" + }, + { + "EventName": "l2_request_g2.ls_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests for non-coherent, non-cacheable LS sized reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g2.ls_rd_sized", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests for coherent, non-cacheable LS sized reads.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g2.all", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests of all rare types.", + "UMask": "0x40" + }, + { + "EventName": "l2_wcb_req.wcb_close", + "EventCode": "0x63", + "BriefDescription": "Write Combining Buffer (WCB) closures.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_miss", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 misses.", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on non-modifiable lines.", + "UMask": "0x02" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on modifiable lines.", + "UMask": "0x04" + }, + { + "EventName": "l2_cache_req_stat.ic_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits.", + "UMask": "0x06" + }, + { + "EventName": "l2_cache_req_stat.ic_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 accesses.", + "UMask": "0x07" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_c", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 misses.", + "UMask": "0x08" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 misses.", + "UMask": "0x09" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) that result in data cache stores or L2 state change hits.", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on non-modifiable lines.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on modifiable lines.", + "UMask": "0x40" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_cs", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 read hits on shared lines.", + "UMask": "0x80" + }, + { + "EventName": "l2_cache_req_stat.dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits.", + "UMask": "0xf0" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 hits.", + "UMask": "0xf6" + }, + { + "EventName": "l2_cache_req_stat.dc_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 accesses.", + "UMask": "0xf8" + }, + { + "EventName": "l2_cache_req_stat.all", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 accesses.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_hit_l2.l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_l3.l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_fill_rsp_src.local_ccx", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "l2_fill_rsp_src.near_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_near", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "l2_fill_rsp_src.far_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_far", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_all", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "l2_fill_rsp_src.far_all", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "l2_fill_rsp_src.alt_mem", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "l2_fill_rsp_src.all", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills where data is returned from all types of sources.", + "UMask": "0xde" + }, + { + "EventName": "l2_sys_bw.local_dram_fill", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from DRAM in the same NUMA node.", + "UMask": "0x01" + }, + { + "EventName": "l2_sys_bw.remote_dram_fill", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from DRAM in a different NUMA node.", + "UMask": "0x02" + }, + { + "EventName": "l2_sys_bw.nt_write", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for non-temporal write events that target all NUMA nodes.", + "UMask": "0x04" + }, + { + "EventName": "l2_sys_bw.local_scm_fill", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "l2_sys_bw.remote_scm_fill", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20" + }, + { + "EventName": "l2_sys_bw.victim", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for cache victim events that target all NUMA nodes.", + "UMask": "0x40" + }, + { + "EventName": "l2_sys_bw.all", + "EventCode": "0x175", + "BriefDescription": "System bandwidth utilization for all types of events (total utilization).", + "UMask": "0xff" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json new file mode 100644 index 000000000000..4291eb59426f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json @@ -0,0 +1,523 @@ +[ + { + "EventName": "ls_bad_status2.stli_other", + "EventCode": "0x24", + "BriefDescription": "Store-to-load conflicts (loads unable to complete due to a non-forwardable conflict with an older store).", + "UMask": "0x02" + }, + { + "EventName": "ls_locks.bus_lock", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions which caused a bus lock (non-cacheable or cache-misaligned lock).", + "UMask": "0x01" + }, + { + "EventName": "ls_locks.all", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions of all types.", + "UMask": "0x1f" + }, + { + "EventName": "ls_ret_cl_flush", + "EventCode": "0x26", + "BriefDescription": "Retired CLFLUSH instructions." + }, + { + "EventName": "ls_ret_cpuid", + "EventCode": "0x27", + "BriefDescription": "Retired CPUID instructions." + }, + { + "EventName": "ls_dispatch.pure_ld", + "EventCode": "0x29", + "BriefDescription": "Memory load operations dispatched to the load-store unit.", + "UMask": "0x01" + }, + { + "EventName": "ls_dispatch.pure_st", + "EventCode": "0x29", + "BriefDescription": "Memory store operations dispatched to the load-store unit.", + "UMask": "0x02" + }, + { + "EventName": "ls_dispatch.ld_st", + "EventCode": "0x29", + "BriefDescription": "Memory load-store operations (load from and store to the same memory address) dispatched to the load-store unit.", + "UMask": "0x04" + }, + { + "EventName": "ls_dispatch.all", + "EventCode": "0x29", + "BriefDescription": "Memory operations dispatched to the load-store unit of all types.", + "UMask": "0x07" + }, + { + "EventName": "ls_smi_rx", + "EventCode": "0x2b", + "BriefDescription": "System Management Interrupts (SMIs) received." + }, + { + "EventName": "ls_int_taken", + "EventCode": "0x2c", + "BriefDescription": "Interrupts taken." + }, + { + "EventName": "ls_stlf", + "EventCode": "0x35", + "BriefDescription": "Store-to-load-forward (STLF) hits." + }, + { + "EventName": "ls_st_commit_cancel.older_st_vis_dep", + "EventCode": "0x37", + "BriefDescription": "Store commits cancelled due to an older store, that the thread was waiting on to become globally visible, was unable to become globally visible.", + "UMask": "0x01" + }, + { + "EventName": "ls_mab_alloc.ls", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.", + "UMask": "0x07" + }, + { + "EventName": "ls_mab_alloc.hwpf", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.", + "UMask": "0x08" + }, + { + "EventName": "ls_mab_alloc.all", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.", + "UMask": "0x0f" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_l2", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_ccx", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_dmnd_fills_from_sys.near_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_near", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_dmnd_fills_from_sys.far_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_dmnd_fills_from_sys.remote_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_far", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "ls_dmnd_fills_from_sys.far_all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "ls_dmnd_fills_from_sys.alt_mem", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "ls_dmnd_fills_from_sys.all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills where data is returned from all types of sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_any_fills_from_sys.local_l2", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_any_fills_from_sys.local_ccx", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_any_fills_from_sys.local_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_any_fills_from_sys.near_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_near", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_any_fills_from_sys.far_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_any_fills_from_sys.remote_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_far", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "ls_any_fills_from_sys.far_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from either cache of another CCX, DRAM or MMIO when the address was in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "ls_any_fills_from_sys.alt_mem", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "ls_any_fills_from_sys.all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills where data is returned from all types of data sources.", + "UMask": "0xff" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for coalesced pages (16k pages created from four adjacent 4k pages).", + "UMask": "0x02" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 2M pages.", + "UMask": "0x04" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 1G pages.", + "UMask": "0x08" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 4k pages.", + "UMask": "0x10" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).", + "UMask": "0x20" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 2M pages.", + "UMask": "0x40" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 1G pages.", + "UMask": "0x80" + }, + { + "EventName": "ls_l1_d_tlb_miss.l2_miss_all", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for all page sizes.", + "UMask": "0xf0" + }, + { + "EventName": "ls_l1_d_tlb_miss.all", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses for all page sizes.", + "UMask": "0xff" + }, + { + "EventName": "ls_misal_loads.ma64", + "EventCode": "0x47", + "BriefDescription": "64B misaligned (cacheline crossing) loads.", + "UMask": "0x01" + }, + { + "EventName": "ls_misal_loads.ma4k", + "EventCode": "0x47", + "BriefDescription": "4kB misaligned (page crossing) loads.", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchT0 (move data to all cache levels), T1 (move data to all cache levels except L1) and T2 (move data to all cache levels except L1 and L2).", + "UMask": "0x01" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchW (move data to L1 cache and mark it modifiable).", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_nta", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchNTA (move data with minimum cache pollution i.e. non-temporal access).", + "UMask": "0x04" + }, + { + "EventName": "ls_pref_instr_disp.all", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of all types.", + "UMask": "0x07" + }, + { + "EventName": "wcb_close.full_line_64b", + "EventCode": "0x50", + "BriefDescription": "Events that caused a Write Combining Buffer (WCB) entry to close because all 64 bytes of the entry have been written to.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.dc_hit", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a data cache hit.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.mab_hit", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a match on an already allocated miss request (MAB).", + "UMask": "0x02" + }, + { + "EventName": "ls_inef_sw_pref.all", + "EventCode": "0x52", + "BriefDescript6ion": "Software prefetches that did not fetch data outside of the processor core for any reason.", + "UMask": "0x03" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_l2", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_ccx", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_sw_pf_dc_fills.near_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_near", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_sw_pf_dc_fills.far_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_sw_pf_dc_fills.remote_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_far", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "ls_sw_pf_dc_fills.far_all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "ls_sw_pf_dc_fills.alt_mem", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "ls_sw_pf_dc_fills.all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills where data is returned from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_l2", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_ccx", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_hw_pf_dc_fills.near_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_near", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_hw_pf_dc_fills.far_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_hw_pf_dc_fills.remote_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_far", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.", + "UMask": "0x40" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.", + "UMask": "0x48" + }, + { + "EventName": "ls_hw_pf_dc_fills.far_all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.", + "UMask": "0x50" + }, + { + "EventName": "ls_hw_pf_dc_fills.alt_mem", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from extension memory (CXL).", + "UMask": "0x80" + }, + { + "EventName": "ls_hw_pf_dc_fills.all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills where data is returned from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_alloc_mab_count", + "EventCode": "0x5f", + "BriefDescription": "In-flight L1 data cache misses i.e. Miss Address Buffer (MAB) allocations each cycle." + }, + { + "EventName": "ls_not_halted_cyc", + "EventCode": "0x76", + "BriefDescription": "Core cycles where the thread is not in halted state." + }, + { + "EventName": "ls_tlb_flush.all", + "EventCode": "0x78", + "BriefDescription": "All TLB flushes.", + "UMask": "0xff" + }, + { + "EventName": "ls_not_halted_p0_cyc.p0_freq_cyc", + "EventCode": "0x120", + "BriefDescription": "Reference cycles (P0 frequency) where the thread is not in halted state.", + "UMask": "0x1" + } +] -- cgit v1.2.3 From de18394f8f69e4cb86e1561f3dd86e9f724b8f25 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 8 Jan 2026 13:22:15 +0530 Subject: perf vendor events amd: Add Zen 6 uncore events Add uncore events taken from Section 1.6 "L3 Cache Performance Monitor Counters" and Section 2.2 "UMC Performance Monitor Events" of the Performance Monitor Counters for AMD Family 1Ah Model 50h-57h Processors document available at the link below. This constitutes events which capture L3 cache and UMC command activity. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=309149 Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/amdzen6/l3-cache.json | 177 +++++++++++++++++++++ .../arch/x86/amdzen6/memory-controller.json | 101 ++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json new file mode 100644 index 000000000000..9b9804317da7 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json @@ -0,0 +1,177 @@ +[ + { + "EventName": "l3_lookup_state.l3_miss", + "EventCode": "0x04", + "BriefDescription": "L3 cache misses.", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.l3_hit", + "EventCode": "0x04", + "BriefDescription": "L3 cache hits.", + "UMask": "0xfe", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.all_coherent_accesses_to_l3", + "EventCode": "0x04", + "BriefDescription": "L3 cache requests for all coherent accesses.", + "UMask": "0xff", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.near_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.far_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.all", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency for L3 requests where data is returned from all types of sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_near", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_far", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.near_cache", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.far_cache", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_near", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_far", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.all", + "EventCode": "0xad", + "BriefDescription": "Average sampled L3 requests where data is returned from all types of sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json new file mode 100644 index 000000000000..649a60b09e1b --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json @@ -0,0 +1,101 @@ +[ + { + "EventName": "umc_mem_clk", + "PublicDescription": "Memory clock (MEMCLK) cycles.", + "EventCode": "0x00", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.all", + "PublicDescription": "ACTIVATE commands sent.", + "EventCode": "0x05", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.rd", + "PublicDescription": "ACTIVATE commands sent for reads.", + "EventCode": "0x05", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.wr", + "PublicDescription": "ACTIVATE commands sent for writes.", + "EventCode": "0x05", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.all", + "PublicDescription": "PRECHARGE commands sent.", + "EventCode": "0x06", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.rd", + "PublicDescription": "PRECHARGE commands sent for reads.", + "EventCode": "0x06", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.wr", + "PublicDescription": "PRECHARGE commands sent for writes.", + "EventCode": "0x06", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.all", + "PublicDescription": "CAS commands sent.", + "EventCode": "0x0a", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.rd", + "PublicDescription": "CAS commands sent for reads.", + "EventCode": "0x0a", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.wr", + "PublicDescription": "CAS commands sent for writes.", + "EventCode": "0x0a", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.all", + "PublicDescription": "Clock cycles where the data bus is utilized.", + "EventCode": "0x14", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.rd", + "PublicDescription": "Clock cycles where the data bus is utilized for reads.", + "EventCode": "0x14", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.wr", + "PublicDescription": "Clock cycles where the data bus is utilized for writes.", + "EventCode": "0x14", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + } +] -- cgit v1.2.3 From d0a3df886d777180322a254176c40fd4a4a23cbe Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 8 Jan 2026 13:22:16 +0530 Subject: perf vendor events amd: Add Zen 6 metrics Add metrics taken from Section 1.2 "Performance Measurement" of the Performance Monitor Counters for AMD Family 1Ah Model 50h-57h Processors document available at the link below. The recommended metrics are sourced from Table 1 "Guidance for Common Performance Statistics with Complex Event Selects". The pipeline utilization metrics are sourced from Table 2 "Guidance for Pipeline Utilization Analysis Statistics". These are useful for finding performance bottlenecks by analyzing activity at different stages of the pipeline. There are metric groups available for Level 1 and Level 2 analysis. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=309149 Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/amdzen6/pipeline.json | 99 ++++++ .../pmu-events/arch/x86/amdzen6/recommended.json | 339 +++++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/recommended.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json new file mode 100644 index 000000000000..48c501d8a097 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json @@ -0,0 +1,99 @@ +[ + { + "MetricName": "total_dispatch_slots", + "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).", + "MetricExpr": "8 * ls_not_halted_cyc", + "ScaleUnit": "1slots" + }, + { + "MetricName": "frontend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation", + "BriefDescription": "Percentage of dispatched ops that did not retire.", + "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "smt_contention", + "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring", + "BriefDescription": "Percentage of dispatch slots used by ops that retired.", + "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_latency", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).", + "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_bandwidth", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation_from_mispredicts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.", + "MetricExpr": "d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_fe_redir.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "bad_speculation_from_pipeline_restarts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).", + "MetricExpr": "d_ratio(bad_speculation * bp_fe_redir.resync, ex_ret_brn_misp + bp_fe_redir.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound_by_memory", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.", + "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "backend_bound_by_cpu", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.", + "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_fastpath", + "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.", + "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_microcode", + "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.", + "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json new file mode 100644 index 000000000000..2849a8c159f6 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json @@ -0,0 +1,339 @@ +[ + { + "MetricName": "branch_misprediction_rate", + "BriefDescription": "Execution-time branch misprediction rate (non-speculative).", + "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)", + "MetricGroup": "branch_prediction", + "ScaleUnit": "1per_branch" + }, + { + "MetricName": "all_data_cache_accesses_pti", + "BriefDescription": "All data cache accesses per thousand instructions.", + "MetricExpr": "ls_dispatch.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_accesses_pti", + "BriefDescription": "All L2 cache accesses per thousand instructions.", + "MetricExpr": "(l2_request_g1.no_pf_all + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti", + "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti", + "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.dc_all / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_misses_pti", + "BriefDescription": "All L2 cache misses per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_hits_pti", + "BriefDescription": "All L2 cache hits per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l2_hwpf_pti", + "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l3_cache_accesses", + "BriefDescription": "L3 cache accesses.", + "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_misses", + "BriefDescription": "L3 misses (including cacheline state change requests).", + "MetricExpr": "l3_lookup_state.l3_miss", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_read_miss_latency", + "BriefDescription": "Average L3 read miss latency (in core clocks).", + "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_local_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_remote_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "op_cache_fetch_miss_ratio", + "BriefDescription": "Op cache miss ratio for all fetches.", + "MetricExpr": "d_ratio(op_cache_hit_miss.miss, op_cache_hit_miss.all)", + "ScaleUnit": "100%" + }, + { + "MetricName": "l1_data_cache_fills_from_memory_pti", + "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_remote_node_pti", + "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.far_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.local_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_different_ccx_pti", + "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l1_data_cache_fills_pti", + "BriefDescription": "All L1 data cache fills per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti", + "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_itlb_misses_pti", + "BriefDescription": "L1 instruction TLB misses per thousand instructions.", + "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_itlb_misses_pti", + "BriefDescription": "L2 instruction TLB misses and instruction page walks per thousand instructions.", + "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_dtlb_misses_pti", + "BriefDescription": "L1 data TLB misses per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_dtlb_misses_pti", + "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.l2_miss_all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_tlbs_flushed_pti", + "BriefDescription": "All TLBs flushed per thousand instructions.", + "MetricExpr": "ls_tlb_flush.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "macro_ops_dispatched", + "BriefDescription": "Macro-ops dispatched.", + "MetricExpr": "de_src_op_disp.all", + "MetricGroup": "decoder" + }, + { + "MetricName": "sse_avx_stalls", + "BriefDescription": "Mixed SSE/AVX stalls.", + "MetricExpr": "fp_disp_faults.sse_avx_all" + }, + { + "MetricName": "macro_ops_retired", + "BriefDescription": "Macro-ops retired.", + "MetricExpr": "ex_ret_ops" + }, + { + "MetricName": "umc_data_bus_utilization", + "BriefDescription": "Memory controller data bus utilization.", + "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_write_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for writes.", + "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_mem_read_bandwidth", + "BriefDescription": "Estimated memory read bandwidth.", + "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_write_bandwidth", + "BriefDescription": "Estimated memory write bandwidth.", + "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_bandwidth", + "BriefDescription": "Estimated combined memory bandwidth.", + "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_activate_cmd_rate", + "BriefDescription": "Memory controller ACTIVATE command rate.", + "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_precharge_cmd_rate", + "BriefDescription": "Memory controller PRECHARGE command rate.", + "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + } +] -- cgit v1.2.3 From 47d3545faeeb6822f404ddb237985e1824a8bd70 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 4 Dec 2025 13:11:43 -0800 Subject: perf help: Move common_cmds into builtin-help There's a lot of infrastructure for generating a relatively simple array used by one function. Move the array into the function and remove the supporting build logic. At the same time opportunistically const-ify the array. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Charlie Jenkins Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 11 ++---- tools/perf/builtin-help.c | 51 ++++++++++++++++++++++++--- tools/perf/command-list.txt | 36 ------------------- tools/perf/util/Build | 14 -------- tools/perf/util/generate-cmdlist.sh | 70 ------------------------------------- 5 files changed, 49 insertions(+), 133 deletions(-) delete mode 100644 tools/perf/command-list.txt delete mode 100755 tools/perf/util/generate-cmdlist.sh (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e6895626c187..45d5a59a02cb 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -808,11 +808,6 @@ $(GTK_IN): FORCE prepare $(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS) $(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS) -$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt - -$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt) - $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@ - $(SCRIPTS) : % : %.sh $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@' @@ -850,7 +845,7 @@ endif __build-dir = $(subst $(OUTPUT),,$(dir $@)) build-dir = $(or $(__build-dir),.) -prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ +prepare: $(OUTPUT)PERF-VERSION-FILE archheaders \ arm64-sysreg-defs \ $(syscall_array) \ $(fs_at_flags_array) \ @@ -1054,7 +1049,7 @@ cscope: # However, the environment gets quite big, and some programs have problems # with that. -check: $(OUTPUT)common-cmds.h +check: prepare if sparse; \ then \ for i in *.c */*.c; \ @@ -1297,7 +1292,7 @@ clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $( $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 \ perf-read-vdsox32 $(OUTPUT)$(LIBJVMTI).so $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo \ - $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \ + TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \ $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 7be6fb6df595..2692b2e40a23 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -9,7 +9,6 @@ #include "util/strbuf.h" #include "builtin.h" #include -#include "common-cmds.h" #include #include #include @@ -301,16 +300,58 @@ static struct cmdnames main_cmds, other_cmds; void list_common_cmds_help(void) { - unsigned int i, longest = 0; + const struct cmdname_help { + const char *name; + const char *help; + } common_cmds[] = { + {"annotate", "Read perf.data (created by perf record) and display annotated code"}, + {"archive", + "Create archive with object files with build-ids found in perf.data file"}, + {"bench", "General framework for benchmark suites"}, + {"buildid-cache", "Manage build-id cache."}, + {"buildid-list", "List the buildids in a perf.data file"}, + {"c2c", "Shared Data C2C/HITM Analyzer."}, + {"config", "Get and set variables in a configuration file."}, + {"daemon", "Run record sessions on background"}, + {"data", "Data file related processing"}, + {"diff", "Read perf.data files and display the differential profile"}, + {"evlist", "List the event names in a perf.data file"}, + {"ftrace", "simple wrapper for kernel's ftrace functionality"}, + {"inject", "Filter to augment the events stream with additional information"}, + {"iostat", "Show I/O performance metrics"}, + {"kallsyms", "Searches running kernel for symbols"}, + {"kvm", "Tool to trace/measure kvm guest os"}, + {"list", "List all symbolic event types"}, + {"mem", "Profile memory accesses"}, + {"record", "Run a command and record its profile into perf.data"}, + {"report", "Read perf.data (created by perf record) and display the profile"}, + {"script", "Read perf.data (created by perf record) and display trace output"}, + {"stat", "Run a command and gather performance counter statistics"}, + {"test", "Runs sanity tests."}, + {"top", "System profiling tool."}, + {"version", "display the version of perf binary"}, + #ifdef HAVE_LIBELF_SUPPORT + {"probe", "Define new dynamic tracepoints"}, + #endif /* HAVE_LIBELF_SUPPORT */ + #ifdef HAVE_LIBTRACEEVENT + {"trace", "strace inspired tool"}, + {"kmem", "Tool to trace/measure kernel memory properties"}, + {"kwork", "Tool to trace/measure kernel work properties (latencies)"}, + {"lock", "Analyze lock events"}, + {"sched", "Tool to trace/measure scheduler properties (latencies)"}, + {"timechart", "Tool to visualize total system behavior during a workload"}, + #endif /* HAVE_LIBTRACEEVENT */ + }; + size_t longest = 0; - for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) { if (longest < strlen(common_cmds[i].name)) longest = strlen(common_cmds[i].name); } puts(" The most commonly used perf commands are:"); - for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { - printf(" %-*s ", longest, common_cmds[i].name); + for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) { + printf(" %-*s ", (int)longest, common_cmds[i].name); puts(common_cmds[i].help); } } diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt deleted file mode 100644 index e8d2762adade..000000000000 --- a/tools/perf/command-list.txt +++ /dev/null @@ -1,36 +0,0 @@ -# -# List of known perf commands. -# command name category [deprecated] [common] -# -perf-annotate mainporcelain common -perf-archive mainporcelain common -perf-bench mainporcelain common -perf-buildid-cache mainporcelain common -perf-buildid-list mainporcelain common -perf-data mainporcelain common -perf-diff mainporcelain common -perf-c2c mainporcelain common -perf-config mainporcelain common -perf-evlist mainporcelain common -perf-ftrace mainporcelain common -perf-inject mainporcelain common -perf-iostat mainporcelain common -perf-kallsyms mainporcelain common -perf-kmem mainporcelain traceevent -perf-kvm mainporcelain common -perf-kwork mainporcelain traceevent -perf-list mainporcelain common -perf-lock mainporcelain traceevent -perf-mem mainporcelain common -perf-probe mainporcelain full -perf-record mainporcelain common -perf-report mainporcelain common -perf-sched mainporcelain traceevent -perf-script mainporcelain common -perf-stat mainporcelain common -perf-test mainporcelain common -perf-timechart mainporcelain traceevent -perf-top mainporcelain common -perf-trace mainporcelain audit -perf-version mainporcelain common -perf-daemon mainporcelain common diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 248ad3ac64da..4915f237ba9e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -419,20 +419,6 @@ $(OUTPUT)util/list_sort.o: ../lib/list_sort.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) -ifdef SHELLCHECK - SHELL_TESTS := generate-cmdlist.sh - SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) -else - SHELL_TESTS := - SHELL_TEST_LOGS := -endif - -$(OUTPUT)%.shellcheck_log: % - $(call rule_mkdir) - $(Q)$(call echo-cmd,test)$(SHELLCHECK) "$<" > $@ || (cat $@ && rm $@ && false) - -perf-util-y += $(SHELL_TEST_LOGS) - PY_TESTS := setup.py ifdef MYPY MYPY_TEST_LOGS := $(PY_TESTS:%=%.mypy_log) diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh deleted file mode 100755 index 6a73c903d690..000000000000 --- a/tools/perf/util/generate-cmdlist.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -echo "/* Automatically generated by $0 */ -struct cmdname_help -{ - char name[16]; - char help[80]; -}; - -static struct cmdname_help common_cmds[] = {" - -sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt | -sort | -while read cmd -do - sed -n ' - /^NAME/,/perf-'"$cmd"'/H - ${ - x - s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ - p - }' "Documentation/perf-$cmd.txt" -done - -echo "#ifdef HAVE_LIBELF_SUPPORT" -sed -n -e 's/^perf-\([^ ]*\)[ ].* full.*/\1/p' command-list.txt | -sort | -while read cmd -do - sed -n ' - /^NAME/,/perf-'"$cmd"'/H - ${ - x - s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ - p - }' "Documentation/perf-$cmd.txt" -done -echo "#endif /* HAVE_LIBELF_SUPPORT */" - -echo "#if defined(HAVE_LIBTRACEEVENT)" -sed -n -e 's/^perf-\([^ ]*\)[ ].* audit*/\1/p' command-list.txt | -sort | -while read cmd -do - sed -n ' - /^NAME/,/perf-'"$cmd"'/H - ${ - x - s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ - p - }' "Documentation/perf-$cmd.txt" -done -echo "#endif /* HAVE_LIBTRACEEVENT */" - -echo "#ifdef HAVE_LIBTRACEEVENT" -sed -n -e 's/^perf-\([^ ]*\)[ ].* traceevent.*/\1/p' command-list.txt | -sort | -while read cmd -do - sed -n ' - /^NAME/,/perf-'"$cmd"'/H - ${ - x - s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ - p - }' "Documentation/perf-$cmd.txt" -done -echo "#endif /* HAVE_LIBTRACEEVENT */" -echo "};" -- cgit v1.2.3 From bac74dcbd48b5b441e47841fd0fe507c7b0bcbaf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 19 Nov 2025 15:36:21 -0800 Subject: perf tools: Switch printf("...%s", strerror(errno)) to printf("...%m") strerror() has thread safety issues, strerror_r() requires stack allocated buffers. Code in perf has already been using the "%m" formatting flag that is a widely support glibc extension to print the current errno's description. Expand the usage of this formatting flag and remove usage of strerror()/strerror_r(). Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Blake Jones Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Peter Zijlstra Cc: Stephen Brennan Cc: Thomas Falcon Cc: Yunseong Kim Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/bp-modify.c | 30 ++++++++++++------------------ tools/perf/bench/uprobe.c | 2 +- tools/perf/builtin-daemon.c | 8 +++----- tools/perf/builtin-probe.c | 3 +-- tools/perf/builtin-record.c | 24 ++++++++++-------------- tools/perf/builtin-stat.c | 9 ++++----- tools/perf/builtin-trace.c | 15 ++++++--------- tools/perf/perf.c | 18 +++++++----------- tools/perf/util/bpf-event.c | 11 ++++------- tools/perf/util/bpf-utils.c | 4 ++-- tools/perf/util/bpf_lock_contention.c | 2 +- tools/perf/util/cap.c | 3 +-- tools/perf/util/data.c | 29 ++++++++++------------------- tools/perf/util/dso.c | 19 ++++++------------- tools/perf/util/evlist.c | 31 +++++++++++++++++-------------- tools/perf/util/evsel.c | 17 +++++++++-------- tools/perf/util/jitdump.c | 3 ++- tools/perf/util/lzma.c | 6 +++--- tools/perf/util/session.c | 5 +++-- tools/perf/util/symbol-elf.c | 4 ++-- 20 files changed, 104 insertions(+), 139 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c index 0924ccd9e36d..589b43273948 100644 --- a/tools/perf/arch/x86/tests/bp-modify.c +++ b/tools/perf/arch/x86/tests/bp-modify.c @@ -80,26 +80,24 @@ static int bp_modify1(void) */ if (ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[0]), bp_2)) { - pr_debug("failed to set breakpoint, 1st time: %s\n", - strerror(errno)); + pr_debug("failed to set breakpoint, 1st time: %m\n"); goto out; } if (ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[0]), bp_1)) { - pr_debug("failed to set breakpoint, 2nd time: %s\n", - strerror(errno)); + pr_debug("failed to set breakpoint, 2nd time: %m\n"); goto out; } if (ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[7]), dr7)) { - pr_debug("failed to set dr7: %s\n", strerror(errno)); + pr_debug("failed to set dr7: %m\n"); goto out; } if (ptrace(PTRACE_CONT, child, NULL, NULL)) { - pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno)); + pr_debug("failed to PTRACE_CONT: %m\n"); goto out; } @@ -112,19 +110,17 @@ static int bp_modify1(void) rip = ptrace(PTRACE_PEEKUSER, child, offsetof(struct user_regs_struct, rip), NULL); if (rip == (unsigned long) -1) { - pr_debug("failed to PTRACE_PEEKUSER: %s\n", - strerror(errno)); + pr_debug("failed to PTRACE_PEEKUSER: %m\n"); goto out; } pr_debug("rip %lx, bp_1 %p\n", rip, bp_1); - out: if (ptrace(PTRACE_DETACH, child, NULL, NULL)) { - pr_debug("failed to PTRACE_DETACH: %s", strerror(errno)); + pr_debug("failed to PTRACE_DETACH: %m\n"); return TEST_FAIL; - } + } return rip == (unsigned long) bp_1 ? TEST_OK : TEST_FAIL; } @@ -157,14 +153,13 @@ static int bp_modify2(void) */ if (ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[0]), bp_1)) { - pr_debug("failed to set breakpoint: %s\n", - strerror(errno)); + pr_debug("failed to set breakpoint: %m\n"); goto out; } if (ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[7]), dr7)) { - pr_debug("failed to set dr7: %s\n", strerror(errno)); + pr_debug("failed to set dr7: %m\n"); goto out; } @@ -175,7 +170,7 @@ static int bp_modify2(void) } if (ptrace(PTRACE_CONT, child, NULL, NULL)) { - pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno)); + pr_debug("failed to PTRACE_CONT: %m\n"); goto out; } @@ -188,8 +183,7 @@ static int bp_modify2(void) rip = ptrace(PTRACE_PEEKUSER, child, offsetof(struct user_regs_struct, rip), NULL); if (rip == (unsigned long) -1) { - pr_debug("failed to PTRACE_PEEKUSER: %s\n", - strerror(errno)); + pr_debug("failed to PTRACE_PEEKUSER: %m\n"); goto out; } @@ -197,7 +191,7 @@ static int bp_modify2(void) out: if (ptrace(PTRACE_DETACH, child, NULL, NULL)) { - pr_debug("failed to PTRACE_DETACH: %s", strerror(errno)); + pr_debug("failed to PTRACE_DETACH: %m\n"); return TEST_FAIL; } diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c index 0b90275862e1..c4dac868f1ee 100644 --- a/tools/perf/bench/uprobe.c +++ b/tools/perf/bench/uprobe.c @@ -54,7 +54,7 @@ static const char * const bench_uprobe_usage[] = { /*opts=*/&uprobe_opts); \ if (!skel->links.prog) { \ err = -errno; \ - fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \ + fprintf(stderr, "Failed to attach bench uprobe \"%s\": %m\n", #prog); \ goto cleanup; \ } diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index f0568431fbd5..33473e071392 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -265,8 +265,7 @@ static int check_base(struct daemon *daemon) daemon->base); return -EACCES; default: - pr_err("failed: can't access base '%s': %s\n", - daemon->base, strerror(errno)); + pr_err("failed: can't access base '%s': %m\n", daemon->base); return -errno; } } @@ -544,8 +543,7 @@ static int daemon_session__control(struct daemon_session *session, err = writen(control, msg, len); if (err != len) { - pr_err("failed: write to control pipe: %d (%s)\n", - errno, control_path); + pr_err("failed: write to control pipe: %m (%s)\n", control_path); goto out; } @@ -586,7 +584,7 @@ static int setup_server_socket(struct daemon *daemon) int fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd < 0) { - fprintf(stderr, "socket: %s\n", strerror(errno)); + fprintf(stderr, "socket: %m\n"); return -1; } diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 69800e4d9530..1b4ba85ee019 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -211,8 +211,7 @@ static int opt_set_target_ns(const struct option *opt __maybe_unused, ns_pid = (pid_t)strtol(str, NULL, 10); if (errno != 0) { ret = -errno; - pr_warning("Failed to parse %s as a pid: %s\n", str, - strerror(errno)); + pr_warning("Failed to parse %s as a pid: %m\n", str); return ret; } nsip = nsinfo__new(ns_pid); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index cbfbd9bb1063..003e47a4fc1d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1286,7 +1286,6 @@ static int record__mmap_evlist(struct record *rec, struct record_opts *opts = &rec->opts; bool auxtrace_overwrite = opts->auxtrace_snapshot_mode || opts->auxtrace_sample_mode; - char msg[512]; if (opts->affinity != PERF_AFFINITY_SYS) cpu__setup_cpunode_map(); @@ -1305,8 +1304,7 @@ static int record__mmap_evlist(struct record *rec, opts->mmap_pages, opts->auxtrace_mmap_pages); return -errno; } else { - pr_err("failed to mmap with %d (%s)\n", errno, - str_error_r(errno, msg, sizeof(msg))); + pr_err("failed to mmap: %m\n"); if (errno) return -errno; else @@ -1324,7 +1322,8 @@ static int record__mmap_evlist(struct record *rec, if (record__threads_enabled(rec)) { ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps); if (ret) { - pr_err("Failed to create data directory: %s\n", strerror(-ret)); + errno = -ret; + pr_err("Failed to create data directory: %m\n"); return ret; } for (i = 0; i < evlist->core.nr_mmaps; i++) { @@ -1461,9 +1460,8 @@ try_again: } if (evlist__apply_filters(evlist, &pos, &opts->target)) { - pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", - pos->filter ?: "BPF", evsel__name(pos), errno, - str_error_r(errno, msg, sizeof(msg))); + pr_err("failed to set filter \"%s\" on event %s: %m\n", + pos->filter ?: "BPF", evsel__name(pos)); rc = -1; goto out; } @@ -1748,8 +1746,7 @@ static void *record__thread(void *arg) err = write(thread->pipes.ack[1], &msg, sizeof(msg)); if (err == -1) - pr_warning("threads[%d]: failed to notify on start: %s\n", - thread->tid, strerror(errno)); + pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid); pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu()); @@ -1792,8 +1789,7 @@ static void *record__thread(void *arg) err = write(thread->pipes.ack[1], &msg, sizeof(msg)); if (err == -1) - pr_warning("threads[%d]: failed to notify on termination: %s\n", - thread->tid, strerror(errno)); + pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid); return NULL; } @@ -2338,7 +2334,7 @@ static int record__start_threads(struct record *rec) sigfillset(&full); if (sigprocmask(SIG_SETMASK, &full, &mask)) { - pr_err("Failed to block signals on threads start: %s\n", strerror(errno)); + pr_err("Failed to block signals on threads start: %m\n"); return -1; } @@ -2356,7 +2352,7 @@ static int record__start_threads(struct record *rec) if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) { for (tt = 1; tt < t; tt++) record__terminate_thread(&thread_data[t]); - pr_err("Failed to start threads: %s\n", strerror(errno)); + pr_err("Failed to start threads: %m\n"); ret = -1; goto out_err; } @@ -2379,7 +2375,7 @@ out_err: pthread_attr_destroy(&attrs); if (sigprocmask(SIG_SETMASK, &mask, NULL)) { - pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno)); + pr_err("Failed to unblock signals on threads start: %m\n"); ret = -1; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ab40d85fb125..2895b809607f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -937,9 +937,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) } if (evlist__apply_filters(evsel_list, &counter, &target)) { - pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", - counter->filter, evsel__name(counter), errno, - str_error_r(errno, msg, sizeof(msg))); + pr_err("failed to set filter \"%s\" on event %s: %m\n", + counter->filter, evsel__name(counter)); return -1; } @@ -1001,8 +1000,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) } if (workload_exec_errno) { - const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); - pr_err("Workload failed: %s\n", emsg); + errno = workload_exec_errno; + pr_err("Workload failed: %m\n"); err = -1; goto err_out; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d49c1ae409d7..58a32adafddf 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2616,12 +2616,10 @@ static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *ev err = syscall__read_info(sc, trace); if (err && verbose > 0) { - char sbuf[STRERR_BUFSIZE]; - - fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, - str_error_r(-err, sbuf, sizeof(sbuf))); + errno = -err; + fprintf(trace->output, "Problems reading syscall %d: %m", id); if (sc && sc->name) - fprintf(trace->output, "(%s)", sc->name); + fprintf(trace->output, " (%s)", sc->name); fputs(" information\n", trace->output); } return err ? NULL : sc; @@ -4673,9 +4671,8 @@ out_error: out_error_apply_filters: fprintf(trace->output, - "Failed to set filter \"%s\" on event %s with %d (%s)\n", - evsel->filter, evsel__name(evsel), errno, - str_error_r(errno, errbuf, sizeof(errbuf))); + "Failed to set filter \"%s\" on event %s: %m\n", + evsel->filter, evsel__name(evsel)); goto out_delete_evlist; } out_error_mem: @@ -4683,7 +4680,7 @@ out_error_mem: goto out_delete_evlist; out_errno: - fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno)); + fprintf(trace->output, "%m\n"); goto out_delete_evlist; } diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 88c60ecf3395..f475a8664ffc 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -169,8 +169,8 @@ static int set_debug_file(const char *path) { debug_fp = fopen(path, "w"); if (!debug_fp) { - fprintf(stderr, "Open debug file '%s' failed: %s\n", - path, strerror(errno)); + fprintf(stderr, "Open debug file '%s' failed: %m\n", + path); return -1; } @@ -335,7 +335,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) { int status; struct stat st; - char sbuf[STRERR_BUFSIZE]; if (use_browser == -1) use_browser = check_browser_config(p->cmd); @@ -363,17 +362,15 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) status = 1; /* Check for ENOSPC and EIO errors.. */ if (fflush(stdout)) { - fprintf(stderr, "write failure on standard output: %s", - str_error_r(errno, sbuf, sizeof(sbuf))); + fprintf(stderr, "write failure on standard output: %m\n"); goto out; } if (ferror(stdout)) { - fprintf(stderr, "unknown write failure on standard output"); + fprintf(stderr, "unknown write failure on standard output\n"); goto out; } if (fclose(stdout)) { - fprintf(stderr, "close failed on standard output: %s", - str_error_r(errno, sbuf, sizeof(sbuf))); + fprintf(stderr, "close failed on standard output: %m\n"); goto out; } status = 0; @@ -459,7 +456,6 @@ int main(int argc, const char **argv) { int err, done_help = 0; const char *cmd; - char sbuf[STRERR_BUFSIZE]; perf_debug_setup(); @@ -573,8 +569,8 @@ int main(int argc, const char **argv) } if (cmd) { - fprintf(stderr, "Failed to run command '%s': %s\n", - cmd, str_error_r(errno, sbuf, sizeof(sbuf))); + fprintf(stderr, "Failed to run command '%s': %m\n", + cmd); } out: if (debug_fp) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 2298cd396c42..2e6da3ad0a4f 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -787,11 +787,10 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, err = 0; break; } - pr_debug("%s: can't get next program: %s%s\n", - __func__, strerror(errno), - errno == EINVAL ? " -- kernel too old?" : ""); /* don't report error on old kernel or EPERM */ err = (errno == EINVAL || errno == EPERM) ? 0 : -1; + pr_debug("%s: can\'t get next program: %m%s\n", + __func__, errno == EINVAL ? " -- kernel too old?" : ""); break; } fd = bpf_prog_get_fd_by_id(id); @@ -824,10 +823,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, .tool = session->tool, }; - if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) { - pr_err("%s: failed to synthesize bpf images: %s\n", - __func__, strerror(errno)); - } + if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) + pr_err("%s: failed to synthesize bpf images: %m\n", __func__); free(event); return err; diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c index 5a66dc8594aa..d6d2c9c190f7 100644 --- a/tools/perf/util/bpf-utils.c +++ b/tools/perf/util/bpf-utils.c @@ -123,7 +123,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) /* step 1: get array dimensions */ err = bpf_obj_get_info_by_fd(fd, &info, &info_len); if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); + pr_debug("can't get prog info: %m\n"); return ERR_PTR(-EFAULT); } if (info.type >= __MAX_BPF_PROG_TYPE) @@ -186,7 +186,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) /* step 5: call syscall again to get required arrays */ err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len); if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); + pr_debug("can't get prog info: %m\n"); free(info_linear); return ERR_PTR(-EFAULT); } diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index 7b5671f13c53..788d30be2058 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -42,7 +42,7 @@ static void check_slab_cache_iter(struct lock_contention *con) con->btf = btf__load_vmlinux_btf(); if (con->btf == NULL) { - pr_debug("BTF loading failed: %s\n", strerror(errno)); + pr_debug("BTF loading failed: %m\n"); return; } diff --git a/tools/perf/util/cap.c b/tools/perf/util/cap.c index 24a0ea7e6d97..ac6d1d9a523d 100644 --- a/tools/perf/util/cap.c +++ b/tools/perf/util/cap.c @@ -28,8 +28,7 @@ bool perf_cap__capable(int cap, bool *used_root) header.version == _LINUX_CAPABILITY_VERSION_1) continue; - pr_debug2("capget syscall failed (%s - %d) fall back on root check\n", - strerror(errno), errno); + pr_debug2("capget syscall failed (%m) fall back on root check\n"); *used_root = true; return geteuid() == 0; } diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 164eb45a0b36..90df41da1a32 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -213,17 +213,15 @@ static int check_backup(struct perf_data *data) ret = rm_rf_perf_data(oldname); if (ret) { - pr_err("Can't remove old data: %s (%s)\n", - ret == -2 ? - "Unknown file found" : strerror(errno), - oldname); + if (ret == -2) + pr_err("Can't remove old data: Unknown file found (%s)\n", oldname); + else + pr_err("Can't remove old data: %m (%s)\n", oldname); return -1; } if (rename(data->path, oldname)) { - pr_err("Can't move data: %s (%s to %s)\n", - strerror(errno), - data->path, oldname); + pr_err("Can't move data: %m (%s to %s)\n", data->path, oldname); return -1; } } @@ -246,14 +244,12 @@ static int open_file_read(struct perf_data *data) int flags = data->in_place_update ? O_RDWR : O_RDONLY; struct stat st; int fd; - char sbuf[STRERR_BUFSIZE]; fd = open(data->file.path, flags); if (fd < 0) { int err = errno; - pr_err("failed to open %s: %s", data->file.path, - str_error_r(err, sbuf, sizeof(sbuf))); + pr_err("failed to open %s: %m", data->file.path); if (err == ENOENT && !strcmp(data->file.path, "perf.data")) pr_err(" (try 'perf record' first)"); pr_err("\n"); @@ -285,15 +281,10 @@ static int open_file_read(struct perf_data *data) static int open_file_write(struct perf_data *data) { - int fd; - char sbuf[STRERR_BUFSIZE]; - - fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, - S_IRUSR|S_IWUSR); + int fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, S_IRUSR|S_IWUSR); if (fd < 0) - pr_err("failed to open %s : %s\n", data->file.path, - str_error_r(errno, sbuf, sizeof(sbuf))); + pr_err("failed to open %s : %m\n", data->file.path); return fd; } @@ -436,8 +427,8 @@ int perf_data__switch(struct perf_data *data, if (lseek(data->file.fd, pos, SEEK_SET) == (off_t)-1) { ret = -errno; - pr_debug("Failed to lseek to %zu: %s", - pos, strerror(errno)); + pr_debug("Failed to lseek to %zu: %m\n", + pos); goto out; } } diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 06980844c014..18e656712f5a 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -540,16 +540,13 @@ static void close_first_dso(void); static int do_open(char *name) EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock) { - int fd; - char sbuf[STRERR_BUFSIZE]; - do { - fd = open(name, O_RDONLY|O_CLOEXEC); + int fd = open(name, O_RDONLY|O_CLOEXEC); + if (fd >= 0) return fd; - pr_debug("dso open failed: %s\n", - str_error_r(errno, sbuf, sizeof(sbuf))); + pr_debug("dso open failed: %m\n"); if (!dso__data_open_cnt || errno != EMFILE) break; @@ -1098,7 +1095,6 @@ static int file_size(struct dso *dso, struct machine *machine) { int ret = 0; struct stat st; - char sbuf[STRERR_BUFSIZE]; mutex_lock(dso__data_open_lock()); @@ -1116,8 +1112,7 @@ static int file_size(struct dso *dso, struct machine *machine) if (fstat(dso__data(dso)->fd, &st) < 0) { ret = -errno; - pr_err("dso cache fstat failed: %s\n", - str_error_r(errno, sbuf, sizeof(sbuf))); + pr_err("dso cache fstat failed: %m\n"); dso__data(dso)->status = DSO_DATA_STATUS_ERROR; goto out; } @@ -1773,10 +1768,8 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen) BUG_ON(buflen == 0); if (errnum >= 0) { - const char *err = str_error_r(errnum, buf, buflen); - - if (err != buf) - scnprintf(buf, buflen, "%s", err); + errno = errnum; + scnprintf(buf, buflen, "%m"); return 0; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 649519628541..3b0d837e3046 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1614,14 +1614,14 @@ int evlist__parse_sample_timestamp(struct evlist *evlist, union perf_event *even int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size) { int printed, value; - char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf)); switch (err) { case EACCES: case EPERM: + errno = err; printed = scnprintf(buf, size, - "Error:\t%s.\n" - "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.", emsg); + "Error:\t%m.\n" + "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting."); value = perf_event_paranoid(); @@ -1648,16 +1648,18 @@ int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size if (first->core.attr.sample_freq < (u64)max_freq) goto out_default; + errno = err; printed = scnprintf(buf, size, - "Error:\t%s.\n" + "Error:\t%m.\n" "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n" "Hint:\tThe current value is %d and %" PRIu64 " is being requested.", - emsg, max_freq, first->core.attr.sample_freq); + max_freq, first->core.attr.sample_freq); break; } default: out_default: - scnprintf(buf, size, "%s", emsg); + errno = err; + scnprintf(buf, size, "%m"); break; } @@ -1666,17 +1668,17 @@ out_default: int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size) { - char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf)); int pages_attempted = evlist->core.mmap_len / 1024, pages_max_per_user, printed = 0; switch (err) { case EPERM: sysctl__read_int("kernel/perf_event_mlock_kb", &pages_max_per_user); + errno = err; printed += scnprintf(buf + printed, size - printed, - "Error:\t%s.\n" + "Error:\t%m.\n" "Hint:\tCheck /proc/sys/kernel/perf_event_mlock_kb (%d kB) setting.\n" "Hint:\tTried using %zd kB.\n", - emsg, pages_max_per_user, pages_attempted); + pages_max_per_user, pages_attempted); if (pages_attempted >= pages_max_per_user) { printed += scnprintf(buf + printed, size - printed, @@ -1688,7 +1690,8 @@ int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size "Hint:\tTry using a smaller -m/--mmap-pages value."); break; default: - scnprintf(buf, size, "%s", emsg); + errno = err; + scnprintf(buf, size, "%m"); break; } @@ -1920,8 +1923,8 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_ */ fd = open(s, O_RDWR | O_NONBLOCK | O_CLOEXEC); if (fd < 0) { - pr_err("Failed to open '%s'\n", s); ret = -errno; + pr_err("Failed to open '%s': %m\n", s); goto out_free; } *ctl_fd = fd; @@ -1931,7 +1934,7 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_ /* O_RDWR | O_NONBLOCK means the other end need not be open */ fd = open(p, O_RDWR | O_NONBLOCK | O_CLOEXEC); if (fd < 0) { - pr_err("Failed to open '%s'\n", p); + pr_err("Failed to open '%s': %m\n", p); ret = -errno; goto out_free; } @@ -2364,7 +2367,7 @@ int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *o eet->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); if (eet->timerfd == -1) { err = -errno; - pr_err("timerfd_create failed: %s\n", strerror(errno)); + pr_err("timerfd_create failed: %m\n"); goto free_eet_times; } @@ -2399,7 +2402,7 @@ static int event_enable_timer__set_timer(struct event_enable_timer *eet, int ms) if (timerfd_settime(eet->timerfd, 0, &its, NULL) < 0) { err = -errno; - pr_err("timerfd_settime failed: %s\n", strerror(errno)); + pr_err("timerfd_settime failed: %m\n"); } return err; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ec6552a6f667..e2de642fbf53 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -648,8 +648,9 @@ struct tep_event *evsel__tp_format(struct evsel *evsel) if (IS_ERR(tp_format)) { int err = -PTR_ERR(evsel->tp_format); - pr_err("Error getting tracepoint format '%s' '%s'(%d)\n", - evsel__name(evsel), strerror(err), err); + errno = err; + pr_err("Error getting tracepoint format '%s': %m\n", + evsel__name(evsel)); return NULL; } evsel->tp_format = tp_format; @@ -2772,8 +2773,8 @@ retry_open: PERF_EVENT_IOC_SET_BPF, bpf_fd); if (err && errno != EEXIST) { - pr_err("failed to attach bpf fd %d: %s\n", - bpf_fd, strerror(errno)); + pr_err("failed to attach bpf fd %d: %m\n", + bpf_fd); err = -EINVAL; goto out_close; } @@ -3864,7 +3865,6 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, int err, char *msg, size_t size) { struct perf_pmu *pmu; - char sbuf[STRERR_BUFSIZE]; int printed = 0, enforced = 0; int ret; @@ -3997,10 +3997,11 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, if (ret) return ret; + errno = err; return scnprintf(msg, size, - "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n" - "\"dmesg | grep -i perf\" may provide additional information.\n", - err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel)); + "The sys_perf_event_open() syscall failed for event (%s): %m\n" + "\"dmesg | grep -i perf\" may provide additional information.\n", + evsel__name(evsel)); } struct perf_session *evsel__session(struct evsel *evsel) diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index f00814e37de9..d4fe35f9d9a5 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -90,7 +90,8 @@ jit_emit_elf(struct jit_buf_desc *jd, saved_errno = errno; nsinfo__mountns_exit(&nsc); if (fd == -1) { - pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(saved_errno)); + errno = saved_errno; + pr_warning("cannot create jit ELF %s: %m\n", filename); return -1; } diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c index c355757ed391..91b9b5171d1f 100644 --- a/tools/perf/util/lzma.c +++ b/tools/perf/util/lzma.c @@ -59,7 +59,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd) strm.avail_in = fread(buf_in, 1, sizeof(buf_in), infile); if (ferror(infile)) { - pr_debug("lzma: read error: %s\n", strerror(errno)); + pr_debug("lzma: read error: %m\n"); goto err_lzma_end; } @@ -73,7 +73,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd) ssize_t write_size = sizeof(buf_out) - strm.avail_out; if (writen(output_fd, buf_out, write_size) != write_size) { - pr_debug("lzma: write error: %s\n", strerror(errno)); + pr_debug("lzma: write error: %m\n"); goto err_lzma_end; } @@ -103,7 +103,7 @@ int lzma_decompress_to_file(const char *input, int output_fd) infile = fopen(input, "rb"); if (!infile) { - pr_debug("lzma: fopen failed on %s: '%s'\n", input, strerror(errno)); + pr_debug("lzma: fopen failed on %s: '%m'\n", input); return -1; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 65fa9bdff1b8..922ef6577bbb 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2349,9 +2349,10 @@ reader__read_event(struct reader *rd, struct perf_session *session, if (size < sizeof(struct perf_event_header) || (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) { - pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%s]\n", + errno = -skip; + pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%m]\n", rd->file_offset + rd->head, event->header.size, - event->header.type, strerror(-skip)); + event->header.type); err = skip; goto out; } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index d1dcafa4b3b8..b8fea12997a0 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1105,14 +1105,14 @@ static Elf *read_gnu_debugdata(struct dso *dso, Elf *elf, const char *name, int wrapped = fmemopen(scn_data->d_buf, scn_data->d_size, "r"); if (!wrapped) { - pr_debug("%s: fmemopen: %s\n", __func__, strerror(errno)); + pr_debug("%s: fmemopen: %m\n", __func__); *dso__load_errno(dso) = -errno; return NULL; } temp_fd = mkstemp(temp_filename); if (temp_fd < 0) { - pr_debug("%s: mkstemp: %s\n", __func__, strerror(errno)); + pr_debug("%s: mkstemp: %m\n", __func__); *dso__load_errno(dso) = -errno; fclose(wrapped); return NULL; -- cgit v1.2.3 From 55058e32151f95dc5badd62381d184e89f15de99 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:20 +0000 Subject: KVM: selftests: Add a selftests for nested VMLOAD/VMSAVE Add a test for VMLOAD/VMSAVE in an L2 guest. The test verifies that L1 intercepts for VMSAVE/VMLOAD always work regardless of VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK. Then, more interestingly, it makes sure that when L1 does not intercept VMLOAD/VMSAVE, they work as intended in L2. When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is enabled by L1, VMSAVE/VMLOAD from L2 should interpret the GPA as an L2 GPA and translate it through the NPT. When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is disabled by L1, VMSAVE/VMLOAD from L2 should interpret the GPA as an L1 GPA. To test this, put two VMCBs (0 and 1) in L1's physical address space, and have a single L2 GPA where: - L2 VMCB GPA == L1 VMCB(0) GPA - L2 VMCB GPA maps to L1 VMCB(1) via the NPT in L1. This setup allows detecting how the GPA is interpreted based on which L1 VMCB is actually accessed. In both cases, L2 sets KERNEL_GS_BASE (one of the fields handled by VMSAVE/VMLOAD), and executes VMSAVE to write its value to the VMCB. The test userspace code then checks that the write was made to the correct VMCB (based on whether VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is set by L1), and writes a new value to that VMCB. L2 then executes VMLOAD to load the new value and makes sure it's reflected correctly in KERNERL_GS_BASE. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/include/x86/processor.h | 1 + .../selftests/kvm/x86/nested_vmsave_vmload_test.c | 197 +++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ffbf891b31d3..fc78d4508ebd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -95,6 +95,7 @@ TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test +TEST_GEN_PROGS_x86 += x86/nested_vmsave_vmload_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 8f130e7d7048..6bfffc3b0a33 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -201,6 +201,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) #define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) #define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_V_VMSAVE_VMLOAD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 15) #define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) #define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30) #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c new file mode 100644 index 000000000000..6764a48f9d4d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + +/* + * Allocate two VMCB pages for testing. Both pages have different GVAs (shared + * by both L1 and L2) and L1 GPAs. A single L2 GPA is used such that: + * - L2 GPA == L1 GPA for VMCB0. + * - L2 GPA is mapped to L1 GPA for VMCB1 using NPT in L1. + * + * This allows testing whether the GPA used by VMSAVE/VMLOAD in L2 is + * interpreted as a direct L1 GPA or translated using NPT as an L2 GPA, depends + * on which VMCB is accessed. + */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 2 +#define TEST_MEM_BASE 0xc0000000 + +#define TEST_GUEST_ADDR(idx) (TEST_MEM_BASE + (idx) * PAGE_SIZE) + +#define TEST_VMCB_L1_GPA(idx) TEST_GUEST_ADDR(idx) +#define TEST_VMCB_GVA(idx) TEST_GUEST_ADDR(idx) + +#define TEST_VMCB_L2_GPA TEST_VMCB_L1_GPA(0) + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code_vmsave(void) +{ + asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmload(void) +{ + asm volatile("vmload %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmcb(int vmcb_idx) +{ + wrmsr(MSR_KERNEL_GS_BASE, 0xaaaa); + l2_guest_code_vmsave(); + + /* Verify the VMCB used by VMSAVE and update KERNEL_GS_BASE to 0xbbbb */ + GUEST_SYNC(vmcb_idx); + + l2_guest_code_vmload(); + GUEST_ASSERT_EQ(rdmsr(MSR_KERNEL_GS_BASE), 0xbbbb); + + /* Reset MSR_KERNEL_GS_BASE */ + wrmsr(MSR_KERNEL_GS_BASE, 0); + l2_guest_code_vmsave(); + + vmmcall(); +} + +static void l2_guest_code_vmcb0(void) +{ + l2_guest_code_vmcb(0); +} + +static void l2_guest_code_vmcb1(void) +{ + l2_guest_code_vmcb(1); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Each test case initializes the guest RIP below */ + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */ + svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */ + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */ + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* Now clear the intercepts to test VMSAVE/VMLOAD behavior */ + svm->vmcb->control.intercept &= ~(BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* + * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be + * interpreted as an L1 GPA, so VMCB0 should be used. + */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0; + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + /* + * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as + * an L2 GPA, and translated through the NPT to VMCB1. + */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1; + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + struct vmcb *test_vmcb[2]; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_NPT)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, 0); + + for (i = 0; i <= 1; i++) { + virt_map(vm, TEST_VMCB_GVA(i), TEST_VMCB_L1_GPA(i), 1); + test_vmcb[i] = (struct vmcb *)addr_gva2hva(vm, TEST_VMCB_GVA(i)); + } + + tdp_identity_map_default_memslots(vm); + + /* + * L2 GPA == L1_GPA(0), but map it to L1_GPA(1), to allow testing + * whether the L2 GPA is interpreted as an L1 GPA or translated through + * the NPT. + */ + TEST_ASSERT_EQ(TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(0)); + tdp_map(vm, TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(1), PAGE_SIZE); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + i = uc.args[1]; + TEST_ASSERT(i == 0 || i == 1, "Unexpected VMCB idx: %d", i); + + /* + * Check that only the expected VMCB has KERNEL_GS_BASE + * set to 0xaaaa, and update it to 0xbbbb. + */ + TEST_ASSERT_EQ(test_vmcb[i]->save.kernel_gs_base, 0xaaaa); + TEST_ASSERT_EQ(test_vmcb[1-i]->save.kernel_gs_base, 0); + test_vmcb[i]->save.kernel_gs_base = 0xbbbb; + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From 159ca97cd97ce8cc65364fee37319823b5ffb5bd Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:13 +0000 Subject: perf parse-events: Refactor get_config_terms() to remove macros The ADD_CONFIG_TERM() macros build the __type argument out of a partial EVSEL__CONFIG_TERM_x enum name. This means that they can't be called from a function where __type is a variable and it's also impossible to grep the codebase to find usages of these enums as they're never typed in full. Fix this by removing the macros and replacing them with an add_config_term() function. It seems the main reason these existed in the first place was to avoid type punning and to write to a specific field in the union, but the same thing can be achieved with a single write to a u64 'val' field. Running the Perf tests with "-fsanitize=undefined -fno-sanitize-recover" results in no new issues as a result of this change. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel_config.h | 1 + tools/perf/util/parse-events.c | 146 ++++++++++++++++++++++++----------------- 2 files changed, 86 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index bcd3a978f0c4..685fd8d5c4a8 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -50,6 +50,7 @@ struct evsel_config_term { u64 cfg_chg; char *str; int cpu; + u64 val; } val; bool weak; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 17c1c36a7bf9..46422286380f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1116,105 +1116,107 @@ static int config_attr(struct perf_event_attr *attr, return 0; } -static int get_config_terms(const struct parse_events_terms *head_config, - struct list_head *head_terms) +static struct evsel_config_term *add_config_term(enum evsel_term_type type, + struct list_head *head_terms, + bool weak) { -#define ADD_CONFIG_TERM(__type, __weak) \ - struct evsel_config_term *__t; \ - \ - __t = zalloc(sizeof(*__t)); \ - if (!__t) \ - return -ENOMEM; \ - \ - INIT_LIST_HEAD(&__t->list); \ - __t->type = EVSEL__CONFIG_TERM_ ## __type; \ - __t->weak = __weak; \ - list_add_tail(&__t->list, head_terms) - -#define ADD_CONFIG_TERM_VAL(__type, __name, __val, __weak) \ -do { \ - ADD_CONFIG_TERM(__type, __weak); \ - __t->val.__name = __val; \ -} while (0) + struct evsel_config_term *t; -#define ADD_CONFIG_TERM_STR(__type, __val, __weak) \ -do { \ - ADD_CONFIG_TERM(__type, __weak); \ - __t->val.str = strdup(__val); \ - if (!__t->val.str) { \ - zfree(&__t); \ - return -ENOMEM; \ - } \ - __t->free_str = true; \ -} while (0) + t = zalloc(sizeof(*t)); + if (!t) + return NULL; + + INIT_LIST_HEAD(&t->list); + t->type = type; + t->weak = weak; + list_add_tail(&t->list, head_terms); + return t; +} + +static int get_config_terms(const struct parse_events_terms *head_config, + struct list_head *head_terms) +{ struct parse_events_term *term; list_for_each_entry(term, &head_config->terms, list) { + struct evsel_config_term *new_term; + enum evsel_term_type new_type; + bool str_type = false; + u64 val; + switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: - ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_PERIOD; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: - ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_FREQ; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_TIME: - ADD_CONFIG_TERM_VAL(TIME, time, term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_TIME; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: - ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str, term->weak); + new_type = EVSEL__CONFIG_TERM_CALLGRAPH; + str_type = true; break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: - ADD_CONFIG_TERM_STR(BRANCH, term->val.str, term->weak); + new_type = EVSEL__CONFIG_TERM_BRANCH; + str_type = true; break; case PARSE_EVENTS__TERM_TYPE_STACKSIZE: - ADD_CONFIG_TERM_VAL(STACK_USER, stack_user, - term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_STACK_USER; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_INHERIT: - ADD_CONFIG_TERM_VAL(INHERIT, inherit, - term->val.num ? 1 : 0, term->weak); + new_type = EVSEL__CONFIG_TERM_INHERIT; + val = term->val.num ? 1 : 0; break; case PARSE_EVENTS__TERM_TYPE_NOINHERIT: - ADD_CONFIG_TERM_VAL(INHERIT, inherit, - term->val.num ? 0 : 1, term->weak); + new_type = EVSEL__CONFIG_TERM_INHERIT; + val = term->val.num ? 0 : 1; break; case PARSE_EVENTS__TERM_TYPE_MAX_STACK: - ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack, - term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_MAX_STACK; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: - ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events, - term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_MAX_EVENTS; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: - ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, - term->val.num ? 1 : 0, term->weak); + new_type = EVSEL__CONFIG_TERM_OVERWRITE; + val = term->val.num ? 1 : 0; break; case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: - ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, - term->val.num ? 0 : 1, term->weak); + new_type = EVSEL__CONFIG_TERM_OVERWRITE; + val = term->val.num ? 0 : 1; break; case PARSE_EVENTS__TERM_TYPE_DRV_CFG: - ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str, term->weak); + new_type = EVSEL__CONFIG_TERM_DRV_CFG; + str_type = true; break; case PARSE_EVENTS__TERM_TYPE_PERCORE: - ADD_CONFIG_TERM_VAL(PERCORE, percore, - term->val.num ? true : false, term->weak); + new_type = EVSEL__CONFIG_TERM_PERCORE; + val = term->val.num ? true : false; break; case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: - ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output, - term->val.num ? 1 : 0, term->weak); + new_type = EVSEL__CONFIG_TERM_AUX_OUTPUT; + val = term->val.num ? 1 : 0; break; case PARSE_EVENTS__TERM_TYPE_AUX_ACTION: - ADD_CONFIG_TERM_STR(AUX_ACTION, term->val.str, term->weak); + new_type = EVSEL__CONFIG_TERM_AUX_ACTION; + str_type = true; break; case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: - ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size, - term->val.num, term->weak); + new_type = EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE; + val = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: - ADD_CONFIG_TERM_STR(RATIO_TO_PREV, term->val.str, term->weak); + new_type = EVSEL__CONFIG_TERM_RATIO_TO_PREV; + str_type = true; break; case PARSE_EVENTS__TERM_TYPE_USER: case PARSE_EVENTS__TERM_TYPE_CONFIG: @@ -1229,7 +1231,23 @@ do { \ case PARSE_EVENTS__TERM_TYPE_RAW: case PARSE_EVENTS__TERM_TYPE_CPU: default: - break; + /* Don't add a new term for these ones */ + continue; + } + + new_term = add_config_term(new_type, head_terms, term->weak); + if (!new_term) + return -ENOMEM; + + if (str_type) { + new_term->val.str = strdup(term->val.str); + if (!new_term->val.str) { + zfree(&new_term); + return -ENOMEM; + } + new_term->free_str = true; + } else { + new_term->val.val = val; } } return 0; @@ -1290,10 +1308,16 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head } } - if (bits) - ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits, false); + if (bits) { + struct evsel_config_term *new_term; + + new_term = add_config_term(EVSEL__CONFIG_TERM_CFG_CHG, + head_terms, false); + if (!new_term) + return -ENOMEM; + new_term->val.cfg_chg = bits; + } -#undef ADD_CONFIG_TERM return 0; } -- cgit v1.2.3 From 4563e23bd9e4057d6d22ae6631f9dee781fd22bd Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:14 +0000 Subject: perf evsel: Refactor evsel__set_config_if_unset() arguments Make the evsel argument first to match the other evsel__* functions and remove the redundant pmu argument, which can be accessed via evsel. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 9 +++------ tools/perf/arch/arm64/util/arm-spe.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 3 +-- tools/perf/util/evsel.h | 4 ++-- tools/perf/util/pmu.c | 6 +++--- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index ea891d12f8f4..c28208361d91 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -441,10 +441,8 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * when a context switch happened. */ if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { - evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, - "timestamp", 1); - evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, - "contextid", 1); + evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1); + evsel__set_config_if_unset(cs_etm_evsel, "contextid", 1); } /* @@ -453,8 +451,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * timestamp tracing. */ if (opts->sample_time_set) - evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, - "timestamp", 1); + evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1); /* Add dummy event to keep tracking */ err = parse_event(evlist, "dummy:u"); diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index d5ec1408d0ae..51014f8bff97 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -274,7 +274,7 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus) */ if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(evsel, CPU); - evsel__set_config_if_unset(evsel->pmu, evsel, "ts_enable", 1); + evsel__set_config_if_unset(evsel, "ts_enable", 1); } /* diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index b394ad9cc635..c131a727774f 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -664,8 +664,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, return 0; if (opts->auxtrace_sample_mode) - evsel__set_config_if_unset(intel_pt_pmu, intel_pt_evsel, - "psb_period", 0); + evsel__set_config_if_unset(intel_pt_evsel, "psb_period", 0); err = intel_pt_validate_config(intel_pt_pmu, intel_pt_evsel); if (err) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a08130ff2e47..2cf87bc67df7 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -575,8 +575,8 @@ void evsel__uniquify_counter(struct evsel *counter); ((((src) >> (pos)) & ((1ull << (size)) - 1)) << (63 - ((pos) + (size) - 1))) u64 evsel__bitfield_swap_branch_flags(u64 value); -void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel, - const char *config_name, u64 val); +void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, + u64 val); bool evsel__is_offcpu_event(struct evsel *evsel); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 956ea273c2c7..e87c12946d71 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1382,8 +1382,8 @@ bool evsel__is_aux_event(const struct evsel *evsel) * something to true, pass 1 for val rather than a pre shifted value. */ #define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask)) -void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel, - const char *config_name, u64 val) +void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, + u64 val) { u64 user_bits = 0, bits; struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG); @@ -1391,7 +1391,7 @@ void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel, if (term) user_bits = term->val.cfg_chg; - bits = perf_pmu__format_bits(pmu, config_name); + bits = perf_pmu__format_bits(evsel->pmu, config_name); /* Do nothing if the user changed the value */ if (bits & user_bits) -- cgit v1.2.3 From 11ac46060512f6ef1caedabf9a1a129157d0e8a5 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:15 +0000 Subject: perf evsel: Move evsel__* functions to evsel.c At least one of these were put here to avoid a Python binding linking issue which is no longer present. Put them back in their correct location to avoid confusion about which file to add a new evsel__* function to later. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/all/ZEbAS2yx2fguW60w@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 40 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu.c | 40 ---------------------------------------- 2 files changed, 40 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index e2de642fbf53..27bdef01beaf 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1315,6 +1315,35 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs return found_term; } +/* + * Set @config_name to @val as long as the user hasn't already set or cleared it + * by passing a config term on the command line. + * + * @val is the value to put into the bits specified by @config_name rather than + * the bit pattern. It is shifted into position by this function, so to set + * something to true, pass 1 for val rather than a pre shifted value. + */ +#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask)) +void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, + u64 val) +{ + u64 user_bits = 0, bits; + struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG); + + if (term) + user_bits = term->val.cfg_chg; + + bits = perf_pmu__format_bits(evsel->pmu, config_name); + + /* Do nothing if the user changed the value */ + if (bits & user_bits) + return; + + /* Otherwise replace it */ + evsel->core.attr.config &= ~bits; + evsel->core.attr.config |= field_prep(bits, val); +} + void __weak arch_evsel__set_sample_weight(struct evsel *evsel) { evsel__set_sample_bit(evsel, WEIGHT); @@ -4099,6 +4128,17 @@ void evsel__set_leader(struct evsel *evsel, struct evsel *leader) evsel->core.leader = &leader->core; } +bool evsel__is_aux_event(const struct evsel *evsel) +{ + struct perf_pmu *pmu; + + if (evsel->needs_auxtrace_mmap) + return true; + + pmu = evsel__find_pmu(evsel); + return pmu && pmu->auxtrace; +} + int evsel__source_count(const struct evsel *evsel) { struct evsel *pos; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index e87c12946d71..e3a1f26213ec 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1362,46 +1362,6 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) } } -bool evsel__is_aux_event(const struct evsel *evsel) -{ - struct perf_pmu *pmu; - - if (evsel->needs_auxtrace_mmap) - return true; - - pmu = evsel__find_pmu(evsel); - return pmu && pmu->auxtrace; -} - -/* - * Set @config_name to @val as long as the user hasn't already set or cleared it - * by passing a config term on the command line. - * - * @val is the value to put into the bits specified by @config_name rather than - * the bit pattern. It is shifted into position by this function, so to set - * something to true, pass 1 for val rather than a pre shifted value. - */ -#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask)) -void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, - u64 val) -{ - u64 user_bits = 0, bits; - struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG); - - if (term) - user_bits = term->val.cfg_chg; - - bits = perf_pmu__format_bits(evsel->pmu, config_name); - - /* Do nothing if the user changed the value */ - if (bits & user_bits) - return; - - /* Otherwise replace it */ - evsel->core.attr.config &= ~bits; - evsel->core.attr.config |= field_prep(bits, val); -} - static struct perf_pmu_format * pmu_find_format(const struct list_head *formats, const char *name) { -- cgit v1.2.3 From 5b5e01304f13a53daec000b28ba60e51b149cdf4 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:16 +0000 Subject: perf evsel: Support sparse fields in evsel__set_config_if_unset() Sparse config fields are technically supported although currently unused. field_prep() only works for contiguous bitfields so replace it with pmu_format_value(). pmu_format_value() also takes a bitmap rather than a u64 so replace 'u64 bits' with format->bits. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 19 +++++++++++-------- tools/perf/util/pmu.c | 32 +++----------------------------- tools/perf/util/pmu.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 27bdef01beaf..9a9f5e5a64b9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1323,25 +1323,28 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs * the bit pattern. It is shifted into position by this function, so to set * something to true, pass 1 for val rather than a pre shifted value. */ -#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask)) void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, u64 val) { - u64 user_bits = 0, bits; + u64 user_bits = 0; struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG); + struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, + config_name); + int fbit; + + if (!format) + return; if (term) user_bits = term->val.cfg_chg; - bits = perf_pmu__format_bits(evsel->pmu, config_name); - /* Do nothing if the user changed the value */ - if (bits & user_bits) - return; + for_each_set_bit(fbit, format->bits, PERF_PMU_FORMAT_BITS) + if ((1ULL << fbit) & user_bits) + return; /* Otherwise replace it */ - evsel->core.attr.config &= ~bits; - evsel->core.attr.config |= field_prep(bits, val); + pmu_format_value(format->bits, val, &evsel->core.attr.config, /*zero=*/true); } void __weak arch_evsel__set_sample_weight(struct evsel *evsel) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index e3a1f26213ec..7967d9159742 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -118,31 +118,6 @@ struct perf_pmu_alias { bool info_loaded; }; -/** - * struct perf_pmu_format - Values from a format file read from - * /devices/cpu/format/ held in struct perf_pmu. - * - * For example, the contents of /devices/cpu/format/event may be - * "config:0-7" and will be represented here as name="event", - * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set. - */ -struct perf_pmu_format { - /** @list: Element on list within struct perf_pmu. */ - struct list_head list; - /** @bits: Which config bits are set by this format value. */ - DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); - /** @name: The modifier/file name. */ - char *name; - /** - * @value : Which config value the format relates to. Supported values - * are from PERF_PMU_FORMAT_VALUE_CONFIG to - * PERF_PMU_FORMAT_VALUE_CONFIG_END. - */ - u16 value; - /** @loaded: Has the contents been loaded/parsed. */ - bool loaded; -}; - static int pmu_aliases_parse(struct perf_pmu *pmu); static struct perf_pmu_format *perf_pmu__new_format(struct list_head *list, char *name) @@ -1362,8 +1337,8 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) } } -static struct perf_pmu_format * -pmu_find_format(const struct list_head *formats, const char *name) +struct perf_pmu_format *pmu_find_format(const struct list_head *formats, + const char *name) { struct perf_pmu_format *format; @@ -1404,8 +1379,7 @@ int perf_pmu__format_type(struct perf_pmu *pmu, const char *name) * Sets value based on the format definition (format parameter) * and unformatted value (value parameter). */ -static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, - bool zero) +void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero) { unsigned long fbit, vbit; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 8f11bfe8ed6d..3a53e1882cf1 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -233,6 +233,31 @@ struct pmu_event_info { bool deprecated; }; +/** + * struct perf_pmu_format - Values from a format file read from + * /devices/cpu/format/ held in struct perf_pmu. + * + * For example, the contents of /devices/cpu/format/event may be + * "config:0-7" and will be represented here as name="event", + * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set. + */ +struct perf_pmu_format { + /** @list: Element on list within struct perf_pmu. */ + struct list_head list; + /** @bits: Which config bits are set by this format value. */ + DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); + /** @name: The modifier/file name. */ + char *name; + /** + * @value : Which config value the format relates to. Supported values + * are from PERF_PMU_FORMAT_VALUE_CONFIG to + * PERF_PMU_FORMAT_VALUE_CONFIG_END. + */ + u16 value; + /** @loaded: Has the contents been loaded/parsed. */ + bool loaded; +}; + typedef int (*pmu_event_callback)(void *state, struct pmu_event_info *info); typedef int (*pmu_format_callback)(void *state, const char *name, int config, const unsigned long *bits); @@ -254,6 +279,9 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_ u64 *alternate_hw_config, struct parse_events_error *err); int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb); +void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero); +struct perf_pmu_format *pmu_find_format(const struct list_head *formats, + const char *name); void perf_pmu_format__set_value(void *format, int config, unsigned long *bits); bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb); -- cgit v1.2.3 From a2441cf3a5930370aa02d14f2c90fcc4c2ba26f7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:17 +0000 Subject: perf parse-events: Track all user changed config bits Currently we only track which bits were set by the user in attr->config. But all configN fields should be treated equally as they can all have default and user overridden values. Track them all by making get_config_chgs() generic and calling it once for each config value. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 9 +++- tools/perf/util/evsel_config.h | 6 ++- tools/perf/util/parse-events.c | 98 +++++++++++++++++++++++------------------- tools/perf/util/pmu.c | 4 +- tools/perf/util/pmu.h | 4 +- 5 files changed, 70 insertions(+), 51 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9a9f5e5a64b9..1a41a10d9ab2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1243,7 +1243,11 @@ static void evsel__apply_config_terms(struct evsel *evsel, case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE: /* Already applied by auxtrace */ break; - case EVSEL__CONFIG_TERM_CFG_CHG: + case EVSEL__CONFIG_TERM_USR_CHG_CONFIG: + case EVSEL__CONFIG_TERM_USR_CHG_CONFIG1: + case EVSEL__CONFIG_TERM_USR_CHG_CONFIG2: + case EVSEL__CONFIG_TERM_USR_CHG_CONFIG3: + case EVSEL__CONFIG_TERM_USR_CHG_CONFIG4: break; case EVSEL__CONFIG_TERM_RATIO_TO_PREV: rtp_buf = term->val.str; @@ -1327,7 +1331,8 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, u64 val) { u64 user_bits = 0; - struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG); + struct evsel_config_term *term = evsel__get_config_term(evsel, + USR_CHG_CONFIG); struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, config_name); int fbit; diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index 685fd8d5c4a8..7b565d76c0bc 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -27,7 +27,11 @@ enum evsel_term_type { EVSEL__CONFIG_TERM_AUX_OUTPUT, EVSEL__CONFIG_TERM_AUX_ACTION, EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE, - EVSEL__CONFIG_TERM_CFG_CHG, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG1, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG2, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG3, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG4, EVSEL__CONFIG_TERM_RATIO_TO_PREV, }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 46422286380f..1f6e2213326d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1253,66 +1253,32 @@ static int get_config_terms(const struct parse_events_terms *head_config, return 0; } -/* - * Add EVSEL__CONFIG_TERM_CFG_CHG where cfg_chg will have a bit set for - * each bit of attr->config that the user has changed. - */ -static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head_config, - struct list_head *head_terms) +static int add_cfg_chg(const struct perf_pmu *pmu, + const struct parse_events_terms *head_config, + struct list_head *head_terms, + int format_type, + enum parse_events__term_type term_type, + enum evsel_term_type new_term_type) { struct parse_events_term *term; u64 bits = 0; int type; list_for_each_entry(term, &head_config->terms, list) { - switch (term->type_term) { - case PARSE_EVENTS__TERM_TYPE_USER: + if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) { type = perf_pmu__format_type(pmu, term->config); - if (type != PERF_PMU_FORMAT_VALUE_CONFIG) + if (type != format_type) continue; bits |= perf_pmu__format_bits(pmu, term->config); - break; - case PARSE_EVENTS__TERM_TYPE_CONFIG: + } else if (term->type_term == term_type) { bits = ~(u64)0; - break; - case PARSE_EVENTS__TERM_TYPE_CONFIG1: - case PARSE_EVENTS__TERM_TYPE_CONFIG2: - case PARSE_EVENTS__TERM_TYPE_CONFIG3: - case PARSE_EVENTS__TERM_TYPE_CONFIG4: - case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG: - case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE_CONFIG: - case PARSE_EVENTS__TERM_TYPE_NAME: - case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: - case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: - case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: - case PARSE_EVENTS__TERM_TYPE_TIME: - case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: - case PARSE_EVENTS__TERM_TYPE_STACKSIZE: - case PARSE_EVENTS__TERM_TYPE_NOINHERIT: - case PARSE_EVENTS__TERM_TYPE_INHERIT: - case PARSE_EVENTS__TERM_TYPE_MAX_STACK: - case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: - case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: - case PARSE_EVENTS__TERM_TYPE_OVERWRITE: - case PARSE_EVENTS__TERM_TYPE_DRV_CFG: - case PARSE_EVENTS__TERM_TYPE_PERCORE: - case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: - case PARSE_EVENTS__TERM_TYPE_AUX_ACTION: - case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: - case PARSE_EVENTS__TERM_TYPE_METRIC_ID: - case PARSE_EVENTS__TERM_TYPE_RAW: - case PARSE_EVENTS__TERM_TYPE_CPU: - case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: - default: - break; } } if (bits) { struct evsel_config_term *new_term; - new_term = add_config_term(EVSEL__CONFIG_TERM_CFG_CHG, - head_terms, false); + new_term = add_config_term(new_term_type, head_terms, false); if (!new_term) return -ENOMEM; new_term->val.cfg_chg = bits; @@ -1321,6 +1287,50 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head return 0; } +/* + * Add EVSEL__CONFIG_TERM_USR_CFG_CONFIGn where cfg_chg will have a bit set for + * each bit of attr->configN that the user has changed. + */ +static int get_config_chgs(const struct perf_pmu *pmu, + const struct parse_events_terms *head_config, + struct list_head *head_terms) +{ + int ret; + + ret = add_cfg_chg(pmu, head_config, head_terms, + PERF_PMU_FORMAT_VALUE_CONFIG, + PARSE_EVENTS__TERM_TYPE_CONFIG, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG); + if (ret) + return ret; + + ret = add_cfg_chg(pmu, head_config, head_terms, + PERF_PMU_FORMAT_VALUE_CONFIG1, + PARSE_EVENTS__TERM_TYPE_CONFIG1, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG1); + if (ret) + return ret; + + ret = add_cfg_chg(pmu, head_config, head_terms, + PERF_PMU_FORMAT_VALUE_CONFIG2, + PARSE_EVENTS__TERM_TYPE_CONFIG2, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG2); + if (ret) + return ret; + + ret = add_cfg_chg(pmu, head_config, head_terms, + PERF_PMU_FORMAT_VALUE_CONFIG3, + PARSE_EVENTS__TERM_TYPE_CONFIG3, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG3); + if (ret) + return ret; + + return add_cfg_chg(pmu, head_config, head_terms, + PERF_PMU_FORMAT_VALUE_CONFIG4, + PARSE_EVENTS__TERM_TYPE_CONFIG4, + EVSEL__CONFIG_TERM_USR_CHG_CONFIG4); +} + int parse_events_add_tracepoint(struct parse_events_state *parse_state, struct list_head *list, const char *sys, const char *event, diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7967d9159742..dc5dab69151f 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1349,7 +1349,7 @@ struct perf_pmu_format *pmu_find_format(const struct list_head *formats, return NULL; } -__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name) +__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name) { struct perf_pmu_format *format = pmu_find_format(&pmu->format, name); __u64 bits = 0; @@ -1364,7 +1364,7 @@ __u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name) return bits; } -int perf_pmu__format_type(struct perf_pmu *pmu, const char *name) +int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name) { struct perf_pmu_format *format = pmu_find_format(&pmu->format, name); diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 3a53e1882cf1..7655d996090a 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -272,8 +272,8 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu, struct parse_events_terms *terms, bool zero, bool apply_hardcoded, struct parse_events_error *error); -__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name); -int perf_pmu__format_type(struct perf_pmu *pmu, const char *name); +__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name); +int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name); int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_terms, struct perf_pmu_info *info, bool *rewrote_terms, u64 *alternate_hw_config, struct parse_events_error *err); -- cgit v1.2.3 From 87775abac8733f5a4856cd59122c6dc8c8032a13 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:18 +0000 Subject: perf evsel: apply evsel__set_config_if_unset() to all config fields Misleadingly, evsel__set_config_if_unset() only works with the config field and not config1, config2, etc. This is fine at the moment because all users of it happen to operate on bits that are in that config field. Fix it before there are any new users of the function which operate on bits in different config fields. In theory it's also possible for a driver to move an existing bit to another config field and this fixes that scenario too, although this hasn't happened yet either. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1a41a10d9ab2..32517683351f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1336,6 +1336,36 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, config_name); int fbit; + __u64 *vp; + + if (!format) + return; + + switch (format->value) { + case PERF_PMU_FORMAT_VALUE_CONFIG: + term = evsel__get_config_term(evsel, USR_CHG_CONFIG); + vp = &evsel->core.attr.config; + break; + case PERF_PMU_FORMAT_VALUE_CONFIG1: + term = evsel__get_config_term(evsel, USR_CHG_CONFIG1); + vp = &evsel->core.attr.config1; + break; + case PERF_PMU_FORMAT_VALUE_CONFIG2: + term = evsel__get_config_term(evsel, USR_CHG_CONFIG2); + vp = &evsel->core.attr.config2; + break; + case PERF_PMU_FORMAT_VALUE_CONFIG3: + term = evsel__get_config_term(evsel, USR_CHG_CONFIG3); + vp = &evsel->core.attr.config3; + break; + case PERF_PMU_FORMAT_VALUE_CONFIG4: + term = evsel__get_config_term(evsel, USR_CHG_CONFIG4); + vp = &evsel->core.attr.config4; + break; + default: + pr_err("Unknown format value: %d\n", format->value); + return; + } if (!format) return; @@ -1349,7 +1379,7 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, return; /* Otherwise replace it */ - pmu_format_value(format->bits, val, &evsel->core.attr.config, /*zero=*/true); + pmu_format_value(format->bits, val, vp, /*zero=*/true); } void __weak arch_evsel__set_sample_weight(struct evsel *evsel) -- cgit v1.2.3 From 34b4cfbe5cb03328a3330ea47dd8df00715dd627 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:19 +0000 Subject: perf evsel: Add a helper to get the value of a config field This will be used by aux PMUs to read an already written value for configuring their events and for also testing. Its helper perf_pmu__format_unpack() does the opposite of the existing pmu_format_value() so rename that one to perf_pmu__format_pack() so it's clear how they are related. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 42 +++++++++++++++++++++++++++++++++++++++++- tools/perf/util/evsel.h | 2 ++ tools/perf/util/pmu.c | 35 ++++++++++++++++++++++++++++------- tools/perf/util/pmu.h | 4 +++- 4 files changed, 74 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 32517683351f..6d324141588c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1379,7 +1379,47 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, return; /* Otherwise replace it */ - pmu_format_value(format->bits, val, vp, /*zero=*/true); + perf_pmu__format_pack(format->bits, val, vp, /*zero=*/true); +} + + +int evsel__get_config_val(const struct evsel *evsel, const char *config_name, + u64 *val) +{ + struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, config_name); + + if (!format || bitmap_empty(format->bits, PERF_PMU_FORMAT_BITS)) { + pr_err("Unknown/empty format name: %s\n", config_name); + *val = 0; + return -EINVAL; + } + + switch (format->value) { + case PERF_PMU_FORMAT_VALUE_CONFIG: + *val = perf_pmu__format_unpack(format->bits, + evsel->core.attr.config); + return 0; + case PERF_PMU_FORMAT_VALUE_CONFIG1: + *val = perf_pmu__format_unpack(format->bits, + evsel->core.attr.config1); + return 0; + case PERF_PMU_FORMAT_VALUE_CONFIG2: + *val = perf_pmu__format_unpack(format->bits, + evsel->core.attr.config2); + return 0; + case PERF_PMU_FORMAT_VALUE_CONFIG3: + *val = perf_pmu__format_unpack(format->bits, + evsel->core.attr.config3); + return 0; + case PERF_PMU_FORMAT_VALUE_CONFIG4: + *val = perf_pmu__format_unpack(format->bits, + evsel->core.attr.config4); + return 0; + default: + pr_err("Unknown format value: %d\n", format->value); + *val = 0; + return -EINVAL; + } } void __weak arch_evsel__set_sample_weight(struct evsel *evsel) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 2cf87bc67df7..95c4bd0f0f2e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -575,6 +575,8 @@ void evsel__uniquify_counter(struct evsel *counter); ((((src) >> (pos)) & ((1ull << (size)) - 1)) << (63 - ((pos) + (size) - 1))) u64 evsel__bitfield_swap_branch_flags(u64 value); +int evsel__get_config_val(const struct evsel *evsel, const char *config_name, + u64 *val); void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name, u64 val); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index dc5dab69151f..bb399a47d2b4 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1337,6 +1337,26 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) } } +/* + * Unpacks a raw config[n] value using the sparse bitfield that defines a + * format attr. For example "config1:1,6-7,44" defines a 4 bit value across non + * contiguous bits and this function returns those 4 bits as a value. + */ +u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val) +{ + int val_bit = 0; + u64 res = 0; + int fmt_bit; + + for_each_set_bit(fmt_bit, format, PERF_PMU_FORMAT_BITS) { + if (config_val & (1ULL << fmt_bit)) + res |= BIT_ULL(val_bit); + + val_bit++; + } + return res; +} + struct perf_pmu_format *pmu_find_format(const struct list_head *formats, const char *name) { @@ -1379,7 +1399,8 @@ int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name) * Sets value based on the format definition (format parameter) * and unformatted value (value parameter). */ -void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero) +void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v, + bool zero) { unsigned long fbit, vbit; @@ -1496,23 +1517,23 @@ static int pmu_config_term(const struct perf_pmu *pmu, switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_CONFIG: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - pmu_format_value(bits, term->val.num, &attr->config, zero); + perf_pmu__format_pack(bits, term->val.num, &attr->config, zero); break; case PARSE_EVENTS__TERM_TYPE_CONFIG1: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - pmu_format_value(bits, term->val.num, &attr->config1, zero); + perf_pmu__format_pack(bits, term->val.num, &attr->config1, zero); break; case PARSE_EVENTS__TERM_TYPE_CONFIG2: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - pmu_format_value(bits, term->val.num, &attr->config2, zero); + perf_pmu__format_pack(bits, term->val.num, &attr->config2, zero); break; case PARSE_EVENTS__TERM_TYPE_CONFIG3: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - pmu_format_value(bits, term->val.num, &attr->config3, zero); + perf_pmu__format_pack(bits, term->val.num, &attr->config3, zero); break; case PARSE_EVENTS__TERM_TYPE_CONFIG4: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - pmu_format_value(bits, term->val.num, &attr->config4, zero); + perf_pmu__format_pack(bits, term->val.num, &attr->config4, zero); break; case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG: assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); @@ -1650,7 +1671,7 @@ static int pmu_config_term(const struct perf_pmu *pmu, */ } - pmu_format_value(format->bits, val, vp, zero); + perf_pmu__format_pack(format->bits, val, vp, zero); return 0; } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 7655d996090a..7ef90b54a149 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -279,12 +279,14 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_ u64 *alternate_hw_config, struct parse_events_error *err); int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb); -void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero); +void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v, + bool zero); struct perf_pmu_format *pmu_find_format(const struct list_head *formats, const char *name); void perf_pmu_format__set_value(void *format, int config, unsigned long *bits); bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb); +u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); -- cgit v1.2.3 From 6f87719b8ae170448348e56a82228ca39a3336a6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:20 +0000 Subject: perf parse-events: Always track user config changes Requiring the 'pmu->perf_event_attr_init_default' callback to be set to track user changes is a bit of a trap to fall in. It's hard to see that this is required when depending on the user change tracking. It's possible to want all 0 defaults so not set it, but at the same time still do some programmatic setting of configs with evsel__set_config_if_unset(). Also if a PMU reverts to 0 defaults and deletes its existing callback, it will silently break existing uses of evsel__set_config_if_unset(). One way to fix this would be to assert in evsel__set_config_if_unset() if the changes weren't tracked, but that would be a possibly untested runtime failure. Instead, always track it as it's harmless and simplifies testing too. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1f6e2213326d..c8f2962a06c7 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1528,12 +1528,8 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, return -ENOMEM; } - /* - * When using default config, record which bits of attr->config were - * changed by the user. - */ - if (pmu->perf_event_attr_init_default && - get_config_chgs(pmu, &parsed_terms, &config_terms)) { + /* Record which bits of attr->config were changed by the user. */ + if (get_config_chgs(pmu, &parsed_terms, &config_terms)) { parse_events_terms__exit(&parsed_terms); return -ENOMEM; } -- cgit v1.2.3 From 8e2ef85c66dc65b61ca16be2650936387dc0d583 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:21 +0000 Subject: perf tests: Test evsel__set_config_if_unset() and config change tracking Test that evsel__set_config_if_unset() behaves as expected. This also tests the user config change tracking mechanism as it depends on it. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index cbded2c6faa4..0ebf2d7b2cb4 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -192,12 +192,102 @@ static int test__pmu_format(struct test_suite *test __maybe_unused, int subtest } if (attr.config2 != 0x0400000020041d07) { pr_err("Unexpected config2 value %llx\n", attr.config2); + } + + ret = TEST_OK; +err_out: + parse_events_terms__exit(&terms); + test_pmu_put(dir, pmu); + return ret; +} + +static int test__pmu_usr_chgs(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +{ + const char *event = "perf-pmu-test/config=15,config1=4,krava02=170," + "krava03=1,krava11=27,krava12=1/"; + struct parse_events_terms terms; + struct parse_events_error err; + LIST_HEAD(config_terms); + struct evlist *evlist; + struct perf_pmu *pmu; + struct evsel *evsel; + int ret = TEST_FAIL; + char dir[PATH_MAX]; + u64 val; + + pmu = test_pmu_get(dir, sizeof(dir)); + if (!pmu) + return TEST_FAIL; + + evlist = evlist__new(); + if (evlist == NULL) { + pr_err("Failed allocation"); + goto err_out; + } + + parse_events_terms__init(&terms); + ret = parse_events(evlist, event, &err); + if (ret) { + pr_debug("failed to parse event '%s', err %d\n", event, ret); + parse_events_error__print(&err, event); + if (parse_events_error__contains(&err, "can't access trace events")) + ret = TEST_SKIP; goto err_out; } + evsel = evlist__first(evlist); + + /* + * Set via config=15, krava01 bits 0-1 + * Set via config1=4, krava11 bit 1 + * Set values: krava02=170, krava03=1, krava11=27, krava12=1 + * + * Test that already set values aren't overwritten. + */ + evsel__set_config_if_unset(evsel, "krava01", 16); + evsel__get_config_val(evsel, "krava01", &val); + TEST_ASSERT_EQUAL("krava01 overwritten", (int) val, (15 & 0b11)); + + evsel__set_config_if_unset(evsel, "krava11", 45); + evsel__get_config_val(evsel, "krava11", &val); + TEST_ASSERT_EQUAL("krava11 overwritten", (int) val, (27 | (4 << 1))); + + evsel__set_config_if_unset(evsel, "krava02", 32); + evsel__get_config_val(evsel, "krava02", &val); + TEST_ASSERT_EQUAL("krava02 overwritten", (int) val, 170); + + evsel__set_config_if_unset(evsel, "krava03", 0); + evsel__get_config_val(evsel, "krava03", &val); + TEST_ASSERT_EQUAL("krava03 overwritten", (int) val, 1); + + /* + * krava13 doesn't have any bits set by either krava13= or config1= + * but setting _any_ raw value for config1 implies that krava13 + * shouldn't be overwritten. So it's value should remain as 0. + */ + evsel__set_config_if_unset(evsel, "krava13", 5); + evsel__get_config_val(evsel, "krava13", &val); + TEST_ASSERT_EQUAL("krava13 overwritten", (int) val, 0); + + /* + * Unset values: krava21, krava22, krava23 + * + * Test that unset values are overwritten. + */ + evsel__set_config_if_unset(evsel, "krava21", 13905); + evsel__get_config_val(evsel, "krava21", &val); + TEST_ASSERT_EQUAL("krava21 not overwritten", (int) val, 13905); + + evsel__set_config_if_unset(evsel, "krava22", 11); + evsel__get_config_val(evsel, "krava22", &val); + TEST_ASSERT_EQUAL("krava22 not overwritten", (int) val, 11); + evsel__set_config_if_unset(evsel, "krava23", 0); + evsel__get_config_val(evsel, "krava23", &val); + TEST_ASSERT_EQUAL("krava23 not overwritten", (int) val, 0); ret = TEST_OK; err_out: parse_events_terms__exit(&terms); + evlist__delete(evlist); test_pmu_put(dir, pmu); return ret; } @@ -539,6 +629,7 @@ static struct test_case tests__pmu[] = { TEST_CASE("PMU name combining", name_len), TEST_CASE("PMU name comparison", name_cmp), TEST_CASE("PMU cmdline match", pmu_match), + TEST_CASE("PMU user config changes", pmu_usr_chgs), { .name = NULL, } }; -- cgit v1.2.3 From 4c2efb230a76d9dcdf0e4c39d1116df08312e740 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:22 +0000 Subject: perf cs-etm: Make a helper to find the Coresight evsel This pattern occurs a few times and we'll add another one later, so add a helper function for it. Reviewed-by: Ian Rogers Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 50 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index c28208361d91..a49753f0d20f 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -302,6 +302,19 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, return 0; } +static struct evsel *cs_etm_get_evsel(struct evlist *evlist, + struct perf_pmu *cs_etm_pmu) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == cs_etm_pmu->type) + return evsel; + } + + return NULL; +} + static int cs_etm_recording_options(struct auxtrace_record *itr, struct evlist *evlist, struct record_opts *opts) @@ -473,29 +486,21 @@ out: static u64 cs_etm_get_config(struct auxtrace_record *itr) { - u64 config = 0; struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; struct evlist *evlist = ptr->evlist; - struct evsel *evsel; + struct evsel *evsel = cs_etm_get_evsel(evlist, cs_etm_pmu); - evlist__for_each_entry(evlist, evsel) { - if (evsel->core.attr.type == cs_etm_pmu->type) { - /* - * Variable perf_event_attr::config is assigned to - * ETMv3/PTM. The bit fields have been made to match - * the ETMv3.5 ETRMCR register specification. See the - * PMU_FORMAT_ATTR() declarations in - * drivers/hwtracing/coresight/coresight-perf.c for - * details. - */ - config = evsel->core.attr.config; - break; - } - } - - return config; + /* + * Variable perf_event_attr::config is assigned to + * ETMv3/PTM. The bit fields have been made to match + * the ETMv3.5 ETRMCR register specification. See the + * PMU_FORMAT_ATTR() declarations in + * drivers/hwtracing/coresight/coresight-perf.c for + * details. + */ + return evsel ? evsel->core.attr.config : 0; } #ifndef BIT @@ -829,12 +834,11 @@ static int cs_etm_snapshot_start(struct auxtrace_record *itr) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); - struct evsel *evsel; + struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, ptr->cs_etm_pmu); + + if (evsel) + return evsel__disable(evsel); - evlist__for_each_entry(ptr->evlist, evsel) { - if (evsel->core.attr.type == ptr->cs_etm_pmu->type) - return evsel__disable(evsel); - } return -EINVAL; } -- cgit v1.2.3 From 4ffd443f5d1fc85740ac60f9ccd0200fab42f95e Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:23 +0000 Subject: perf cs-etm: Don't use hard coded config bits when setting up ETMCR Perf only looks at attr.config when determining what was programmed into ETMCR. These bits could theoretically be in any of the config fields. Add a generic helper to find the value of any named format field in any config field and then use it to get the attributes relevant to ETMCR. The kernel will also stop publishing the ETMCR register bits in a header [1] so preempt that by defining them here. Move field_prep() to util.h so we can define it along side field_get(). Unfortunately FIELD_PREP() and FIELD_GET() from the kernel can't be used as they require the mask to be a compile time constant. [1]: https://lore.kernel.org/linux-arm-kernel/20251128-james-cs-syncfreq-v8-10-4d319764cc58@linaro.org/ Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index a49753f0d20f..f535027ce862 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -68,6 +68,12 @@ static const char * const metadata_ete_ro[] = { enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE }; + +/* ETMv3 ETMCR register bits */ +#define ETMCR_CYC_ACC BIT(12) +#define ETMCR_TIMESTAMP_EN BIT(28) +#define ETMCR_RETURN_STACK BIT(29) + static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val); static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path); @@ -484,6 +490,33 @@ out: return err; } +static u64 cs_etm_synth_etmcr(struct auxtrace_record *itr) +{ + struct cs_etm_recording *ptr = + container_of(itr, struct cs_etm_recording, itr); + struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; + struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu); + u64 etmcr = 0; + u64 val; + + if (!evsel) + return 0; + + /* + * Synthesize what the kernel programmed into ETMCR based on + * what options the event was opened with. This doesn't have to be + * complete or 100% accurate, not all bits used by OpenCSD anyway. + */ + if (!evsel__get_config_val(evsel, "cycacc", &val) && val) + etmcr |= ETMCR_CYC_ACC; + if (!evsel__get_config_val(evsel, "timestamp", &val) && val) + etmcr |= ETMCR_TIMESTAMP_EN; + if (!evsel__get_config_val(evsel, "retstack", &val) && val) + etmcr |= ETMCR_RETURN_STACK; + + return etmcr; +} + static u64 cs_etm_get_config(struct auxtrace_record *itr) { struct cs_etm_recording *ptr = @@ -743,7 +776,7 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, case CS_ETMV3: magic = __perf_cs_etmv3_magic; /* Get configuration register */ - info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); + info->priv[*offset + CS_ETM_ETMCR] = cs_etm_synth_etmcr(itr); /* traceID set to legacy value in case new perf running on old system */ info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ -- cgit v1.2.3 From 3f620f26576526ccc9e5cb1164bd1cf33a7c70bd Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:24 +0000 Subject: perf cs-etm: Don't use hard coded config bits when setting up TRCCONFIGR Perf only looks at attr.config when determining what was programmed into TRCCONFIGR. These bits could theoretically be in any of the config fields. Use the evsel__get_config_val() helper so it's agnostic to which config field they are in. The kernel will also stop publishing the TRCCONFIGR register bits in a header [1] so preempt that by defining them here. [1]: https://lore.kernel.org/linux-arm-kernel/20251128-james-cs-syncfreq-v8-10-4d319764cc58@linaro.org/ Reviewed-by: Ian Rogers Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 79 +++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index f535027ce862..12b28562c2f3 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -68,6 +68,14 @@ static const char * const metadata_ete_ro[] = { enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE }; +/* ETMv4 CONFIGR register bits */ +#define TRCCONFIGR_BB BIT(3) +#define TRCCONFIGR_CCI BIT(4) +#define TRCCONFIGR_CID BIT(6) +#define TRCCONFIGR_VMID BIT(7) +#define TRCCONFIGR_TS BIT(11) +#define TRCCONFIGR_RS BIT(12) +#define TRCCONFIGR_VMIDOPT BIT(15) /* ETMv3 ETMCR register bits */ #define ETMCR_CYC_ACC BIT(12) @@ -517,56 +525,37 @@ static u64 cs_etm_synth_etmcr(struct auxtrace_record *itr) return etmcr; } -static u64 cs_etm_get_config(struct auxtrace_record *itr) +static u64 cs_etmv4_synth_trcconfigr(struct auxtrace_record *itr) { + u64 trcconfigr = 0; struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); + container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - struct evlist *evlist = ptr->evlist; - struct evsel *evsel = cs_etm_get_evsel(evlist, cs_etm_pmu); - - /* - * Variable perf_event_attr::config is assigned to - * ETMv3/PTM. The bit fields have been made to match - * the ETMv3.5 ETRMCR register specification. See the - * PMU_FORMAT_ATTR() declarations in - * drivers/hwtracing/coresight/coresight-perf.c for - * details. - */ - return evsel ? evsel->core.attr.config : 0; -} - -#ifndef BIT -#define BIT(N) (1UL << (N)) -#endif + struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu); + u64 val; -static u64 cs_etmv4_get_config(struct auxtrace_record *itr) -{ - u64 config = 0; - u64 config_opts = 0; + if (!evsel) + return 0; /* - * The perf event variable config bits represent both - * the command line options and register programming - * bits in ETMv3/PTM. For ETMv4 we must remap options - * to real bits + * Synthesize what the kernel programmed into TRCCONFIGR based on + * what options the event was opened with. This doesn't have to be + * complete or 100% accurate, not all bits used by OpenCSD anyway. */ - config_opts = cs_etm_get_config(itr); - if (config_opts & BIT(ETM_OPT_CYCACC)) - config |= BIT(ETM4_CFG_BIT_CYCACC); - if (config_opts & BIT(ETM_OPT_CTXTID)) - config |= BIT(ETM4_CFG_BIT_CTXTID); - if (config_opts & BIT(ETM_OPT_TS)) - config |= BIT(ETM4_CFG_BIT_TS); - if (config_opts & BIT(ETM_OPT_RETSTK)) - config |= BIT(ETM4_CFG_BIT_RETSTK); - if (config_opts & BIT(ETM_OPT_CTXTID2)) - config |= BIT(ETM4_CFG_BIT_VMID) | - BIT(ETM4_CFG_BIT_VMID_OPT); - if (config_opts & BIT(ETM_OPT_BRANCH_BROADCAST)) - config |= BIT(ETM4_CFG_BIT_BB); - - return config; + if (!evsel__get_config_val(evsel, "cycacc", &val) && val) + trcconfigr |= TRCCONFIGR_CCI; + if (!evsel__get_config_val(evsel, "contextid1", &val) && val) + trcconfigr |= TRCCONFIGR_CID; + if (!evsel__get_config_val(evsel, "timestamp", &val) && val) + trcconfigr |= TRCCONFIGR_TS; + if (!evsel__get_config_val(evsel, "retstack", &val) && val) + trcconfigr |= TRCCONFIGR_RS; + if (!evsel__get_config_val(evsel, "contextid2", &val) && val) + trcconfigr |= TRCCONFIGR_VMID | TRCCONFIGR_VMIDOPT; + if (!evsel__get_config_val(evsel, "branch_broadcast", &val) && val) + trcconfigr |= TRCCONFIGR_BB; + + return trcconfigr; } static size_t @@ -688,7 +677,7 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; /* Get trace configuration register */ - data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr); + data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr); /* traceID set to legacy version, in case new perf running on older system */ data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); @@ -720,7 +709,7 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, st struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; /* Get trace configuration register */ - data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr); + data[CS_ETE_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr); /* traceID set to legacy version, in case new perf running on older system */ data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); -- cgit v1.2.3 From 5e63706f1bc1446e40a8643d05a9842ebad4ec34 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:25 +0000 Subject: perf cs-etm: Don't hard code config attribute when configuring the event These instances of hard coded config attributes are used for configuring and validating the event options. Use the config attribute that's published by the driver by replacing the open coded operations with evsel__get_config_val() and evsel__set_config_if_unset(). Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 56 +++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 12b28562c2f3..dc3f4e86b075 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -103,13 +103,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel struct perf_cpu cpu) { int err; - __u64 val; - u64 contextid = evsel->core.attr.config & - (perf_pmu__format_bits(cs_etm_pmu, "contextid") | - perf_pmu__format_bits(cs_etm_pmu, "contextid1") | - perf_pmu__format_bits(cs_etm_pmu, "contextid2")); + u64 ctxt, ctxt1, ctxt2; + __u64 trcidr2; - if (!contextid) + evsel__get_config_val(evsel, "contextid", &ctxt); + evsel__get_config_val(evsel, "contextid1", &ctxt1); + evsel__get_config_val(evsel, "contextid2", &ctxt2); + + if (!ctxt && !ctxt1 && !ctxt2) return 0; /* Not supported in etmv3 */ @@ -120,12 +121,11 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel } /* Get a handle on TRCIDR2 */ - err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &val); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &trcidr2); if (err) return err; - if (contextid & - perf_pmu__format_bits(cs_etm_pmu, "contextid1")) { + if (ctxt1) { /* * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID * tracing is supported: @@ -133,15 +133,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel * 0b00100 Maximum of 32-bit Context ID size. * All other values are reserved. */ - if (BMVAL(val, 5, 9) != 0x4) { + if (BMVAL(trcidr2, 5, 9) != 0x4) { pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; } } - if (contextid & - perf_pmu__format_bits(cs_etm_pmu, "contextid2")) { + if (ctxt2) { /* * TRCIDR2.VMIDOPT[30:29] != 0 and * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid) @@ -149,7 +148,7 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel * virtual context id is < 32bit. * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us. */ - if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) { + if (!BMVAL(trcidr2, 29, 30) || BMVAL(trcidr2, 10, 14) < 4) { pr_err("%s: CONTEXTIDR_EL2 isn't supported, disable with %s/contextid2=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; @@ -163,10 +162,11 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel * struct perf_cpu cpu) { int err; - __u64 val; + u64 val; + __u64 trcidr0; - if (!(evsel->core.attr.config & - perf_pmu__format_bits(cs_etm_pmu, "timestamp"))) + evsel__get_config_val(evsel, "timestamp", &val); + if (!val) return 0; if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) { @@ -176,7 +176,7 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel * } /* Get a handle on TRCIRD0 */ - err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &val); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &trcidr0); if (err) return err; @@ -187,10 +187,9 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel * * 0b00110 Implementation supports a maximum timestamp of 48bits. * 0b01000 Implementation supports a maximum timestamp of 64bits. */ - val &= GENMASK(28, 24); - if (!val) { + trcidr0 &= GENMASK(28, 24); + if (!trcidr0) return -EINVAL; - } return 0; } @@ -273,16 +272,19 @@ static int cs_etm_parse_snapshot_options(struct auxtrace_record *itr, return 0; } +/* + * If the sink name format "@sink_name" is used, lookup the sink by name to convert to + * "sinkid=sink_hash" format. If the user has already manually provided a hash then + * "sinkid" isn't overwritten. If neither are provided then the driver will pick the best + * sink. + */ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, struct evsel *evsel) { char msg[BUFSIZ], path[PATH_MAX], *sink; struct evsel_config_term *term; - int ret = -EINVAL; u32 hash; - - if (evsel->core.attr.config2 & GENMASK(31, 0)) - return 0; + int ret; list_for_each_entry(term, &evsel->config_terms, list) { if (term->type != EVSEL__CONFIG_TERM_DRV_CFG) @@ -305,14 +307,10 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, return ret; } - evsel->core.attr.config2 |= hash; + evsel__set_config_if_unset(evsel, "sinkid", hash); return 0; } - /* - * No sink was provided on the command line - allow the CoreSight - * system to look for a default - */ return 0; } -- cgit v1.2.3 From 571d29baa07e83e637075239f379f91353c24ec9 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 14 Jan 2026 15:57:26 +0000 Subject: perf arm-spe: Don't hard code config attribute Use the config attribute that's published by the driver instead of hard coding "attr.config". Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/arm-spe.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 51014f8bff97..17ced7bbbdda 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -256,7 +256,7 @@ static __u64 arm_spe_pmu__sample_period(const struct perf_pmu *arm_spe_pmu) static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus) { - u64 bit; + u64 pa_enable_bit; evsel->core.attr.freq = 0; evsel->core.attr.sample_period = arm_spe_pmu__sample_period(evsel->pmu); @@ -288,9 +288,10 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus) * inform that the resulting output's SPE samples contain physical addresses * where applicable. */ - bit = perf_pmu__format_bits(evsel->pmu, "pa_enable"); - if (evsel->core.attr.config & bit) - evsel__set_sample_bit(evsel, PHYS_ADDR); + + if (!evsel__get_config_val(evsel, "pa_enable", &pa_enable_bit)) + if (pa_enable_bit) + evsel__set_sample_bit(evsel, PHYS_ADDR); } static int arm_spe_setup_aux_buffer(struct record_opts *opts) @@ -397,6 +398,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, struct perf_cpu_map *cpus = evlist->core.user_requested_cpus; bool discard = false; int err; + u64 discard_bit; sper->evlist = evlist; @@ -425,9 +427,8 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, evlist__for_each_entry_safe(evlist, tmp, evsel) { if (evsel__is_aux_event(evsel)) { arm_spe_setup_evsel(evsel, cpus); - if (evsel->core.attr.config & - perf_pmu__format_bits(evsel->pmu, "discard")) - discard = true; + if (!evsel__get_config_val(evsel, "discard", &discard_bit)) + discard = !!discard_bit; } } -- cgit v1.2.3 From 7c8e817e443c118aa303f1bbcec33df8d9e3487a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 14 Jan 2026 16:25:44 +0000 Subject: selftests/bpf: Extend live regs tests with a test for gotox Add a test which checks that the destination register of a gotox instruction is marked as used and that the union of jump targets is considered as live. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260114162544.83253-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/compute_live_registers.c | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index 6884ab99a421..f05e120f3450 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -431,6 +431,47 @@ __naked void subprog1(void) ::: __clobber_all); } +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +SEC("socket") +__log_level(2) +__msg("2: .1........ (07) r1 += 8") +__msg("3: .1........ (79) r2 = *(u64 *)(r1 +0)") +__msg("4: ..2....... (b7) r3 = 1") +__msg("5: ..23...... (b7) r4 = 2") +__msg("6: ..234..... (0d) gotox r2") +__msg("7: ...3...... (bf) r0 = r3") +__msg("8: 0......... (95) exit") +__msg("9: ....4..... (bf) r0 = r4") +__msg("10: 0......... (95) exit") +__naked +void gotox(void) +{ + asm volatile ( + ".pushsection .jumptables,\"\",@progbits;" +"jt0_%=: .quad l0_%= - socket;" + ".quad l1_%= - socket;" + ".size jt0_%=, 16;" + ".global jt0_%=;" + ".popsection;" + + "r1 = jt0_%= ll;" + "r1 += 8;" + "r2 = *(u64 *)(r1 + 0);" + "r3 = 1;" + "r4 = 2;" + ".8byte %[gotox_r2];" +"l0_%=: r0 = r3;" + "exit;" +"l1_%=: r0 = r4;" + "exit;" + : + : __imm_insn(gotox_r2, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_2, BPF_REG_0, 0, 0)) + : __clobber_all); +} + +#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + /* to retain debug info for BTF generation */ void kfunc_root(void) { -- cgit v1.2.3 From 0ace8f2db6b3b4b0677e559d1a7ab7fd625d61ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jan 2026 20:11:48 +0000 Subject: tools/testing/selftests: add tests for !tgt, src mremap() merges Test that mremap()'ing a VMA into a position such that the target VMA on merge is unfaulted and the source faulted is correctly performed. We cover 4 cases: 1. Previous VMA unfaulted: copied -----| v |-----------|.............| | unfaulted |(faulted VMA)| |-----------|.............| prev target = prev, expand prev to cover. 2. Next VMA unfaulted: copied -----| v |.............|-----------| |(faulted VMA)| unfaulted | |.............|-----------| next target = next, expand next to cover. 3. Both adjacent VMAs unfaulted: copied -----| v |-----------|.............|-----------| | unfaulted |(faulted VMA)| unfaulted | |-----------|.............|-----------| prev next target = prev, expand prev to cover. 4. prev unfaulted, next faulted: copied -----| v |-----------|.............|-----------| | unfaulted |(faulted VMA)| faulted | |-----------|.............|-----------| prev next target = prev, expand prev to cover. Essentially equivalent to 3, but with additional requirement that next's anon_vma is the same as the copied VMA's. Each of these are performed with MREMAP_DONTUNMAP set, which will cause a KASAN assert for UAF or an assert on zero refcount anon_vma if a bug exists with correctly propagating anon_vma state in each scenario. Link: https://lkml.kernel.org/r/f903af2930c7c2c6e0948c886b58d0f42d8e8ba3.1767638272.git.lorenzo.stoakes@oracle.com Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges") Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Jeongjun Park Cc: Liam Howlett Cc: Pedro Falcato Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yeoreum Yun Cc: Harry Yoo Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 232 +++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index 363c1033cc7d..22be149f7109 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -1171,4 +1171,236 @@ TEST_F(merge, mremap_correct_placed_faulted) ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size); } +TEST_F(merge, mremap_faulted_to_unfaulted_prev) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |------------| + * | \ | + * |-----------| | / |---------| + * | unfaulted | v \ | faulted | + * |-----------| / |---------| + * B \ A + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |---------------------------| + * | \ | + * | |-----------| / |---------| + * v | unfaulted | \ | faulted | + * |-----------| / |---------| + * B \ A + * + * Then unmap VMA A to trigger the bug. + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b, *ptr_c; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | unfaulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + /* Fault it in. */ + ptr_b[0] = 'x'; + + /* + * Now move it out of the way so we can place VMAs A, C in position, + * unfaulted. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA C into place. */ + ptr_c = mmap(&self->carveout[page_size + 3 * page_size + 3 * page_size], + 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_c, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b, *ptr_bc; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | faulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* + * Map VMA B and C into place. We have to map them together so their + * anon_vma is the same and the vma->vm_pgoff's are correctly aligned. + */ + ptr_bc = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_bc, MAP_FAILED); + + /* Fault it in. */ + ptr_bc[0] = 'x'; + + /* + * Now move VMA B out the way (splitting VMA BC) so we can place VMA A + * in position, unfaulted, and leave the remainder of the VMA we just + * moved in place, faulted, as VMA C. + */ + ptr_b = mremap(ptr_bc, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From fb39444732f02c32a8312c168d97e33d872c14d3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jan 2026 20:11:50 +0000 Subject: tools/testing/selftests: add forked (un)/faulted VMA merge tests Now we correctly handle forked faulted/unfaulted merge on mremap(), exhaustively assert that we handle this correctly. Do this in the less duplicative way by adding a new merge_with_fork fixture and forked/unforked variants, and abstract the forking logic as necessary to avoid code duplication with this also. Link: https://lkml.kernel.org/r/1daf76d89fdb9d96f38a6a0152d8f3c2e9e30ac7.1767638272.git.lorenzo.stoakes@oracle.com Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges") Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Jeongjun Park Cc: Liam Howlett Cc: Pedro Falcato Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yeoreum Yun Cc: Harry Yoo Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 180 ++++++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index 22be149f7109..10b686102b79 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -22,12 +22,37 @@ FIXTURE(merge) struct procmap_fd procmap; }; +static char *map_carveout(unsigned int page_size) +{ + return mmap(NULL, 30 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); +} + +static pid_t do_fork(struct procmap_fd *procmap) +{ + pid_t pid = fork(); + + if (pid == -1) + return -1; + if (pid != 0) { + wait(NULL); + return pid; + } + + /* Reopen for child. */ + if (close_procmap(procmap)) + return -1; + if (open_self_procmap(procmap)) + return -1; + + return 0; +} + FIXTURE_SETUP(merge) { self->page_size = psize(); /* Carve out PROT_NONE region to map over. */ - self->carveout = mmap(NULL, 30 * self->page_size, PROT_NONE, - MAP_ANON | MAP_PRIVATE, -1, 0); + self->carveout = map_carveout(self->page_size); ASSERT_NE(self->carveout, MAP_FAILED); /* Setup PROCMAP_QUERY interface. */ ASSERT_EQ(open_self_procmap(&self->procmap), 0); @@ -36,7 +61,8 @@ FIXTURE_SETUP(merge) FIXTURE_TEARDOWN(merge) { ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); - ASSERT_EQ(close_procmap(&self->procmap), 0); + /* May fail for parent of forked process. */ + close_procmap(&self->procmap); /* * Clear unconditionally, as some tests set this. It is no issue if this * fails (KSM may be disabled for instance). @@ -44,6 +70,44 @@ FIXTURE_TEARDOWN(merge) prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); } +FIXTURE(merge_with_fork) +{ + unsigned int page_size; + char *carveout; + struct procmap_fd procmap; +}; + +FIXTURE_VARIANT(merge_with_fork) +{ + bool forked; +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, forked) +{ + .forked = true, +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, unforked) +{ + .forked = false, +}; + +FIXTURE_SETUP(merge_with_fork) +{ + self->page_size = psize(); + self->carveout = map_carveout(self->page_size); + ASSERT_NE(self->carveout, MAP_FAILED); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); +} + +FIXTURE_TEARDOWN(merge_with_fork) +{ + ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); + ASSERT_EQ(close_procmap(&self->procmap), 0); + /* See above. */ + prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); +} + TEST_F(merge, mprotect_unfaulted_left) { unsigned int page_size = self->page_size; @@ -322,8 +386,8 @@ TEST_F(merge, forked_target_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -344,19 +408,10 @@ TEST_F(merge, forked_target_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -386,8 +441,8 @@ TEST_F(merge, forked_source_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -408,19 +463,10 @@ TEST_F(merge, forked_source_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -1171,10 +1217,11 @@ TEST_F(merge, mremap_correct_placed_faulted) ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size); } -TEST_F(merge, mremap_faulted_to_unfaulted_prev) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b; /* @@ -1197,6 +1244,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev) /* Fault it in. */ ptr_a[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMA B in position, * unfaulted. @@ -1220,16 +1275,19 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_a, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); + + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + offset); } -TEST_F(merge, mremap_faulted_to_unfaulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b; /* @@ -1253,6 +1311,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next) /* Fault it in. */ ptr_a[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMA B in position, * unfaulted. @@ -1276,16 +1342,18 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next) &self->carveout[page_size]); ASSERT_NE(ptr_a, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size); + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); } -TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_unfaulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b, *ptr_c; /* @@ -1307,6 +1375,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) /* Fault it in. */ ptr_b[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMAs A, C in position, * unfaulted. @@ -1337,13 +1413,21 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_b, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + offset = variant->forked ? 3 * page_size : 9 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); + + /* If forked, B and C should also not have merged. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 3 * page_size); + } } -TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_faulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; @@ -1373,6 +1457,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) /* Fault it in. */ ptr_bc[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move VMA B out the way (splitting VMA BC) so we can place VMA A * in position, unfaulted, and leave the remainder of the VMA we just @@ -1397,10 +1489,16 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_b, MAP_FAILED); - /* The VMAs should have merged. */ - ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); - ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + /* The VMAs should have merged. A,B,C if unforked, B, C if forked. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); + } else { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + } } TEST_HARNESS_MAIN -- cgit v1.2.3 From 21c68ad1d9771d331198cc73cbf6e908d7915f35 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 6 Jan 2026 15:45:47 +0000 Subject: tools/testing/selftests: fix gup_longterm for unknown fs Commit 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm") introduced a small bug causing unknown filesystems to always result in a test failure. This is because do_test() was updated to use a common reporting path, but this case appears to have been missed. This is problematic for e.g. virtme-ng which uses an overlayfs file system, causing gup_longterm to appear to fail each time due to a test count mismatch: # Planned tests != run tests (50 != 46) # Totals: pass:24 fail:0 xfail:0 xpass:0 skip:22 error:0 The fix is to simply change the return into a break. Link: https://lkml.kernel.org/r/20260106154547.214907-1-lorenzo.stoakes@oracle.com Fixes: 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm") Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Liam Howlett Cc: "Liam R. Howlett" Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_longterm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 6279893a0adc..f61150d28eb2 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -179,7 +179,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) if (rw && shared && fs_is_unknown(fs_type)) { ksft_print_msg("Unknown filesystem\n"); result = KSFT_SKIP; - return; + break; } /* * R/O pinning or pinning in a private mapping is always -- cgit v1.2.3 From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:45 +0800 Subject: perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether a L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement is OMR encoding provides up to 8 fine-grained memory regions besides the cache region. A significant enhancement for OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/ds.c | 140 ++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 + include/uapi/linux/perf_event.h | 27 ++++++- tools/include/uapi/linux/perf_event.h | 27 ++++++- 4 files changed, 190 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index feb1c3cf63e4..272e652f25fc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -34,6 +34,17 @@ struct pebs_record_32 { */ +union omr_encoding { + struct { + u8 omr_source : 4; + u8 omr_remote : 1; + u8 omr_hitm : 1; + u8 omr_snoop : 1; + u8 omr_promoted : 1; + }; + u8 omr_full; +}; + union intel_x86_pebs_dse { u64 val; struct { @@ -73,6 +84,18 @@ union intel_x86_pebs_dse { unsigned int lnc_addr_blk:1; unsigned int ld_reserved6:18; }; + struct { + unsigned int pnc_dse: 8; + unsigned int pnc_l2_miss:1; + unsigned int pnc_stlb_clean_hit:1; + unsigned int pnc_stlb_any_hit:1; + unsigned int pnc_stlb_miss:1; + unsigned int pnc_locked:1; + unsigned int pnc_data_blk:1; + unsigned int pnc_addr_blk:1; + unsigned int pnc_fb_full:1; + unsigned int ld_reserved8:16; + }; }; @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void) __intel_pmu_pebs_data_source_cmt(data_source); } +/* Version for Panthercove and later */ + +/* L2 hit */ +#define PNC_PEBS_DATA_SOURCE_MAX 16 +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */ + 0, /* 0x05: Reserved */ + 0, /* 0x06: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */ + 0, /* 0x0b: Reserved */ + 0, /* 0x0c: Reserved */ + 0, /* 0x0d: Reserved */ + 0, /* 0x0e: Reserved */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +/* L2 miss */ +#define OMR_DATA_SOURCE_MAX 16 +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */ + 0, /* 0x01: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */ + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */ + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */ +}; + +static u64 parse_omr_data_source(u8 dse) +{ + union omr_encoding omr; + u64 val = 0; + + omr.omr_full = dse; + val = omr_data_source[omr.omr_source]; + if (omr.omr_source > 0x1 && omr.omr_source < 0x7) + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0; + else if (omr.omr_source > 0x7) + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM); + + if (omr.omr_remote) + val |= REM; + + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT); + + if (omr.omr_source == 0x2) { + u8 snoop = omr.omr_snoop | omr.omr_promoted; + + if (snoop == 0x0) + val |= P(SNOOP, NA); + else if (snoop == 0x1) + val |= P(SNOOP, MISS); + else if (snoop == 0x2) + val |= P(SNOOP, HIT); + else if (snoop == 0x3) + val |= P(SNOOP, NONE); + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) { + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0; + } + + return val; +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status) return lnl_latency_data(event, status); } +u64 pnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + if (!dse.pnc_l2_miss) + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf]; + else + val = parse_omr_data_source(dse.pnc_dse); + + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.pnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.pnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.pnc_data_blk) + val |= P(BLK, DATA); + if (dse.pnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.pnc_data_blk && !dse.pnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 586e3fdfe6d8..bd501c2a0f73 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status); u64 arl_h_latency_data(struct perf_event *event, u64 status); +u64 pnc_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..533393ec94d0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..d4b99610a3b0 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) -- cgit v1.2.3 From b638a9d0f8965b98403022cb91d8f3b31170eb35 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:33 +0000 Subject: KVM: arm64: selftests: Add a test for FEAT_IDST Add a very basic test checking that FEAT_IDST actually works for the {GMID,SMIDR,CSSIDR2}_EL1 registers. Link: https://patch.msgid.link/20260108173233.2911955-10-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/arm64/idreg-idst.c | 117 +++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/kvm/arm64/idreg-idst.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..bfa2fad0aba1 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -175,6 +175,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += arm64/idreg-idst TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer diff --git a/tools/testing/selftests/kvm/arm64/idreg-idst.c b/tools/testing/selftests/kvm/arm64/idreg-idst.c new file mode 100644 index 000000000000..9ca9f125abdb --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/idreg-idst.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Access all FEAT_IDST-handled registers that depend on more than + * just FEAT_AA64, and fail if we don't get an a trap with an 0x18 EC. + */ + +#include +#include +#include + +static volatile bool sys64, undef; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + sys64 = false; \ + undef = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(!undef, #r " unexpected UNDEF"); \ + __GUEST_ASSERT(sys64, #r " didn't trap"); \ + } while(0) + + +static void guest_code(void) +{ + check_sr_read(CCSIDR2_EL1); + check_sr_read(SMIDR_EL1); + check_sr_read(GMID_EL1); + + GUEST_DONE(); +} + +static void guest_sys64_handler(struct ex_regs *regs) +{ + sys64 = true; + undef = false; + regs->pc += 4; +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + sys64 = false; + undef = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_feat_idst(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* This VM has no MTE, no SME, no CCIDX */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_SYS64, guest_sys64_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t mmfr2; + + test_disable_default_vgic(); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + mmfr2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR2_EL1)); + __TEST_REQUIRE(FIELD_GET(ID_AA64MMFR2_EL1_IDS, mmfr2) > 0, + "FEAT_IDST not supported"); + kvm_vm_free(vm); + + test_guest_feat_idst(); + + return 0; +} -- cgit v1.2.3 From 7e03d07d03a486c66d5c084c7185b1bef29049e9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:14 +0000 Subject: KVM: arm64: selftests: Disable unused TTBR1_EL1 translations KVM selftests map all guest code and data into the lower virtual address range (0x0000...) managed by TTBR0_EL1. The upper range (0xFFFF...) managed by TTBR1_EL1 is unused and uninitialized. If a guest accesses the upper range, the MMU attempts a translation table walk using uninitialized registers, leading to unpredictable behavior. Set `TCR_EL1.EPD1` to disable translation table walks for TTBR1_EL1, ensuring that any access to the upper range generates an immediate Translation Fault. Additionally, set `TCR_EL1.TBI1` (Top Byte Ignore) to ensure that tagged pointers in the upper range also deterministically trigger a Translation Fault via EPD1. Define `TCR_EPD1_MASK`, `TCR_EPD1_SHIFT`, and `TCR_TBI1` in `processor.h` to support this configuration. These are based on their definitions in `arch/arm64/include/asm/pgtable-hwdef.h`. Suggested-by: Will Deacon Reviewed-by: Itaru Kitayama Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-2-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/arm64/processor.h | 4 ++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index ff928716574d..ac97a1c436fc 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -90,6 +90,9 @@ #define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) #define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) +#define TCR_EPD1_SHIFT 23 +#define TCR_EPD1_MASK (UL(1) << TCR_EPD1_SHIFT) + #define TCR_IPS_SHIFT 32 #define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) #define TCR_IPS_52_BITS (UL(6) << TCR_IPS_SHIFT) @@ -97,6 +100,7 @@ #define TCR_IPS_40_BITS (UL(2) << TCR_IPS_SHIFT) #define TCR_IPS_36_BITS (UL(1) << TCR_IPS_SHIFT) +#define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_DS (UL(1) << 59) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..5b379da8cb90 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -384,6 +384,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) tcr_el1 |= TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER; tcr_el1 |= TCR_T0SZ(vm->va_bits); + tcr_el1 |= TCR_TBI1; + tcr_el1 |= TCR_EPD1_MASK; if (use_lpa2_pte_format(vm)) tcr_el1 |= TCR_DS; -- cgit v1.2.3 From dd0c5d04d13cae8ff2694ef83d1ae5804d6d9798 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:15 +0000 Subject: KVM: arm64: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 7a6629ef746d ("kvm: selftests: add virt mem support for aarch64") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-3-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/arm64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 5b379da8cb90..607a4e462984 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -23,7 +23,7 @@ static vm_vaddr_t exception_handlers; static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) -- cgit v1.2.3 From 582b39463f1c0774e0b3cb5be2118e8564b7941e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:16 +0000 Subject: KVM: riscv: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-4-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..d5e8747b5e69 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -28,7 +28,7 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) -- cgit v1.2.3 From de00d07321cf3f182762de2308c08062d5b824c0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:17 +0000 Subject: KVM: selftests: Move page_align() to shared header To avoid code duplication, move page_align() to the shared `kvm_util.h` header file. Rename it to vm_page_align(), to make it clear that the alignment is done with respect to the guest's base page size. No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-5-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 7 +------ tools/testing/selftests/kvm/lib/riscv/processor.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..747effa614f1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1258,6 +1258,11 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); } +static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size - 1) & ~(vm->page_size - 1); +} + /* * Arch hook that is invoked via a constructor, i.e. before exeucting main(), * to allow for arch-specific setup that is common to all tests, e.g. computing diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 607a4e462984..1605dc740d1e 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -21,11 +21,6 @@ static vm_vaddr_t exception_handlers; -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; @@ -115,7 +110,7 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; if (vm->pgd_created) return; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index d5e8747b5e69..401245fe31db 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -26,11 +26,6 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) return !ret && !!value; } -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) { return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) << @@ -68,7 +63,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; if (vm->pgd_created) return; -- cgit v1.2.3 From e0a99a2b72f3c6365d9f4d6943ed45f7fc286b70 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:18 +0000 Subject: KVM: selftests: Fix typos and stale comments in kvm_util Fix minor documentation errors in `kvm_util.h` and `kvm_util.c`. - Correct the argument description for `vcpu_args_set` in `kvm_util.h`, which incorrectly listed `vm` instead of `vcpu`. - Fix a typo in the comment for `kvm_selftest_arch_init` ("exeucting" -> "executing"). - Correct the return value description for `vm_vaddr_unused_gap` in `kvm_util.c` to match the implementation, which returns an address "at or above" `vaddr_min`, not "at or below". No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-6-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 4 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 747effa614f1..97f9251eb073 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -939,7 +939,7 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); * VM VCPU Args Set * * Input Args: - * vm - Virtual Machine + * vcpu - vCPU * num - number of arguments * ... - arguments, each of type uint64_t * @@ -1264,7 +1264,7 @@ static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) } /* - * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * Arch hook that is invoked via a constructor, i.e. before executing main(), * to allow for arch-specific setup that is common to all tests, e.g. computing * the default guest "mode". */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..fab6b62d7810 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1351,7 +1351,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) * Output Args: None * * Return: - * Lowest virtual address at or below vaddr_min, with at least + * Lowest virtual address at or above vaddr_min, with at least * sz unused bytes. TEST_ASSERT failure if no area of at least * size sz is available. * -- cgit v1.2.3 From 934d9746ed0206e93506a68c838fe82ef748576a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:57 +0100 Subject: selftests/bpf: Add test for bpf_override_return helper We do not actually test the bpf_override_return helper functionality itself at the moment, only the bpf program being able to attach it. Adding test that override prctl syscall return value on top of kprobe and kprobe.multi. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20260112121157.854473-2-jolsa@kernel.org --- .../selftests/bpf/prog_tests/kprobe_multi_test.c | 44 ++++++++++++++++++++++ .../selftests/bpf/progs/kprobe_multi_override.c | 15 ++++++++ tools/testing/selftests/bpf/trace_helpers.h | 12 ++++++ 3 files changed, 71 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 6cfaa978bc9a..9caef222e528 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include "kprobe_multi.skel.h" #include "trace_helpers.h" @@ -540,6 +542,46 @@ cleanup: kprobe_multi_override__destroy(skel); } +static void test_override(void) +{ + struct kprobe_multi_override *skel = NULL; + int err; + + skel = kprobe_multi_override__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + skel->bss->pid = getpid(); + + /* no override */ + err = prctl(0xffff, 0); + ASSERT_EQ(err, -1, "err"); + + /* kprobe.multi override */ + skel->links.test_override = bpf_program__attach_kprobe_multi_opts(skel->progs.test_override, + SYS_PREFIX "sys_prctl", NULL); + if (!ASSERT_OK_PTR(skel->links.test_override, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + + bpf_link__destroy(skel->links.test_override); + skel->links.test_override = NULL; + + /* kprobe override */ + skel->links.test_kprobe_override = bpf_program__attach_kprobe(skel->progs.test_kprobe_override, + false, SYS_PREFIX "sys_prctl"); + if (!ASSERT_OK_PTR(skel->links.test_kprobe_override, "bpf_program__attach_kprobe")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + +cleanup: + kprobe_multi_override__destroy(skel); +} + #ifdef __x86_64__ static void test_attach_write_ctx(void) { @@ -597,6 +639,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("override")) + test_override(); if (test__start_subtest("session")) test_session_skel_api(); if (test__start_subtest("session_cookie")) diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c index 28f8487c9059..14f39fa6d515 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c @@ -5,9 +5,24 @@ char _license[] SEC("license") = "GPL"; +int pid = 0; + SEC("kprobe.multi") int test_override(struct pt_regs *ctx) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + bpf_override_return(ctx, 123); + return 0; +} + +SEC("kprobe") +int test_kprobe_override(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + bpf_override_return(ctx, 123); return 0; } diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9437bdd4afa5..a5576b2dfc26 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -4,6 +4,18 @@ #include +#ifdef __x86_64__ +#define SYS_PREFIX "__x64_" +#elif defined(__s390x__) +#define SYS_PREFIX "__s390x_" +#elif defined(__aarch64__) +#define SYS_PREFIX "__arm64_" +#elif defined(__riscv) +#define SYS_PREFIX "__riscv_" +#else +#define SYS_PREFIX "" +#endif + #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) -- cgit v1.2.3 From a63e5fe0959200afcfefa7640db44c491f102c4c Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 13 Jan 2026 16:08:19 +0100 Subject: vsock/test: Add test for a linear and non-linear skb getting coalesced Loopback transport can mangle data in rx queue when a linear skb is followed by a small MSG_ZEROCOPY packet. To exercise the logic, send out two packets: a weirdly sized one (to ensure some spare tail room in the skb) and a zerocopy one that's small enough to fit in the spare room of its predecessor. Then, wait for both to land in the rx queue, and check the data received. Faulty packets merger manifests itself by corrupting payload of the later packet. Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260113-vsock-recv-coalescence-v2-2-552b17837cf4@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 5 +++ tools/testing/vsock/vsock_test_zerocopy.c | 74 +++++++++++++++++++++++++++++++ tools/testing/vsock/vsock_test_zerocopy.h | 3 ++ 3 files changed, 82 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index bbe3723babdc..27e39354499a 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -2403,6 +2403,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_accepted_setsockopt_client, .run_server = test_stream_accepted_setsockopt_server, }, + { + .name = "SOCK_STREAM virtio MSG_ZEROCOPY coalescence corruption", + .run_client = test_stream_msgzcopy_mangle_client, + .run_server = test_stream_msgzcopy_mangle_server, + }, {}, }; diff --git a/tools/testing/vsock/vsock_test_zerocopy.c b/tools/testing/vsock/vsock_test_zerocopy.c index 9d9a6cb9614a..a31ddfc1cd0c 100644 --- a/tools/testing/vsock/vsock_test_zerocopy.c +++ b/tools/testing/vsock/vsock_test_zerocopy.c @@ -9,14 +9,18 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include "control.h" +#include "timeout.h" #include "vsock_test_zerocopy.h" #include "msg_zerocopy_common.h" @@ -356,3 +360,73 @@ void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts) control_expectln("DONE"); close(fd); } + +#define GOOD_COPY_LEN 128 /* net/vmw_vsock/virtio_transport_common.c */ + +void test_stream_msgzcopy_mangle_client(const struct test_opts *opts) +{ + char sbuf1[PAGE_SIZE + 1], sbuf2[GOOD_COPY_LEN]; + unsigned long hash; + struct pollfd fds; + int fd, i; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + + memset(sbuf1, 'x', sizeof(sbuf1)); + send_buf(fd, sbuf1, sizeof(sbuf1), 0, sizeof(sbuf1)); + + for (i = 0; i < sizeof(sbuf2); i++) + sbuf2[i] = rand() & 0xff; + + send_buf(fd, sbuf2, sizeof(sbuf2), MSG_ZEROCOPY, sizeof(sbuf2)); + + hash = hash_djb2(sbuf2, sizeof(sbuf2)); + control_writeulong(hash); + + fds.fd = fd; + fds.events = 0; + + if (poll(&fds, 1, TIMEOUT * MSEC_PER_SEC) != 1 || + !(fds.revents & POLLERR)) { + perror("poll"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +void test_stream_msgzcopy_mangle_server(const struct test_opts *opts) +{ + unsigned long local_hash, remote_hash; + char rbuf[PAGE_SIZE + 1]; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Wait, don't race the (buggy) skbs coalescence. */ + vsock_ioctl_int(fd, SIOCINQ, PAGE_SIZE + 1 + GOOD_COPY_LEN); + + /* Discard the first packet. */ + recv_buf(fd, rbuf, PAGE_SIZE + 1, 0, PAGE_SIZE + 1); + + recv_buf(fd, rbuf, GOOD_COPY_LEN, 0, GOOD_COPY_LEN); + remote_hash = control_readulong(); + local_hash = hash_djb2(rbuf, GOOD_COPY_LEN); + + if (local_hash != remote_hash) { + fprintf(stderr, "Data received corrupted\n"); + exit(EXIT_FAILURE); + } + + close(fd); +} diff --git a/tools/testing/vsock/vsock_test_zerocopy.h b/tools/testing/vsock/vsock_test_zerocopy.h index 3ef2579e024d..d46c91a69f16 100644 --- a/tools/testing/vsock/vsock_test_zerocopy.h +++ b/tools/testing/vsock/vsock_test_zerocopy.h @@ -12,4 +12,7 @@ void test_seqpacket_msgzcopy_server(const struct test_opts *opts); void test_stream_msgzcopy_empty_errq_client(const struct test_opts *opts); void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts); +void test_stream_msgzcopy_mangle_client(const struct test_opts *opts); +void test_stream_msgzcopy_mangle_server(const struct test_opts *opts); + #endif /* VSOCK_TEST_ZEROCOPY_H */ -- cgit v1.2.3 From 4f5f148dd7c0459229d2ab9a769b2e820f9ee6a2 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 13 Jan 2026 12:37:44 -0300 Subject: selftests: net: fib-onlink-tests: Convert to use namespaces by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the test breaks if the SUT already has a default route configured for IPv6. Fix by avoiding the use of the default namespace. Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route") Suggested-by: Fernando Fernandez Mancera Signed-off-by: Ricardo B. Marlière Reviewed-by: Ido Schimmel Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260113-selftests-net-fib-onlink-v2-1-89de2b931389@suse.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 71 +++++++++++-------------- 1 file changed, 30 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index ec2d6ceb1f08..c01be076b210 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -120,7 +120,7 @@ log_subsection() run_cmd() { - local cmd="$*" + local cmd="$1" local out local rc @@ -145,7 +145,7 @@ get_linklocal() local pfx local addr - addr=$(${pfx} ip -6 -br addr show dev ${dev} | \ + addr=$(${pfx} ${IP} -6 -br addr show dev ${dev} | \ awk '{ for (i = 3; i <= NF; ++i) { if ($i ~ /^fe80/) @@ -173,58 +173,48 @@ setup() set -e - # create namespace - setup_ns PEER_NS + # create namespaces + setup_ns ns1 + IP="ip -netns $ns1" + setup_ns ns2 # add vrf table - ip li add ${VRF} type vrf table ${VRF_TABLE} - ip li set ${VRF} up - ip ro add table ${VRF_TABLE} unreachable default metric 8192 - ip -6 ro add table ${VRF_TABLE} unreachable default metric 8192 + ${IP} li add ${VRF} type vrf table ${VRF_TABLE} + ${IP} li set ${VRF} up + ${IP} ro add table ${VRF_TABLE} unreachable default metric 8192 + ${IP} -6 ro add table ${VRF_TABLE} unreachable default metric 8192 # create test interfaces - ip li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]} - ip li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]} - ip li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]} - ip li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]} + ${IP} li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]} + ${IP} li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]} + ${IP} li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]} + ${IP} li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]} # enslave vrf interfaces for n in 5 7; do - ip li set ${NETIFS[p${n}]} vrf ${VRF} + ${IP} li set ${NETIFS[p${n}]} vrf ${VRF} done # add addresses for n in 1 3 5 7; do - ip li set ${NETIFS[p${n}]} up - ip addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} - ip addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad + ${IP} li set ${NETIFS[p${n}]} up + ${IP} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} + ${IP} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad done # move peer interfaces to namespace and add addresses for n in 2 4 6 8; do - ip li set ${NETIFS[p${n}]} netns ${PEER_NS} up - ip -netns ${PEER_NS} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} - ip -netns ${PEER_NS} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad + ${IP} li set ${NETIFS[p${n}]} netns ${ns2} up + ip -netns $ns2 addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} + ip -netns $ns2 addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad done - ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64} - ip -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64} + ${IP} -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64} + ${IP} -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64} set +e } -cleanup() -{ - # make sure we start from a clean slate - cleanup_ns ${PEER_NS} 2>/dev/null - for n in 1 3 5 7; do - ip link del ${NETIFS[p${n}]} 2>/dev/null - done - ip link del ${VRF} 2>/dev/null - ip ro flush table ${VRF_TABLE} - ip -6 ro flush table ${VRF_TABLE} -} - ################################################################################ # IPv4 tests # @@ -241,7 +231,7 @@ run_ip() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip ro add table "${table}" "${prefix}"/32 via "${gw}" "${dev}" onlink + run_cmd "${IP} ro add table ${table} ${prefix}/32 via ${gw} ${dev} onlink" log_test $? ${exp_rc} "${desc}" } @@ -257,8 +247,8 @@ run_ip_mpath() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip ro add table "${table}" "${prefix}"/32 \ - nexthop via ${nh1} nexthop via ${nh2} + run_cmd "${IP} ro add table ${table} ${prefix}/32 \ + nexthop via ${nh1} nexthop via ${nh2}" log_test $? ${exp_rc} "${desc}" } @@ -339,7 +329,7 @@ run_ip6() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip -6 ro add table "${table}" "${prefix}"/128 via "${gw}" "${dev}" onlink + run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 via ${gw} ${dev} onlink" log_test $? ${exp_rc} "${desc}" } @@ -353,8 +343,8 @@ run_ip6_mpath() local exp_rc="$6" local desc="$7" - run_cmd ip -6 ro add table "${table}" "${prefix}"/128 "${opts}" \ - nexthop via ${nh1} nexthop via ${nh2} + run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 ${opts} \ + nexthop via ${nh1} nexthop via ${nh2}" log_test $? ${exp_rc} "${desc}" } @@ -491,10 +481,9 @@ do esac done -cleanup setup run_onlink_tests -cleanup +cleanup_ns ${ns1} ${ns2} if [ "$TESTS" != "none" ]; then printf "\nTests passed: %3d\n" ${nsuccess} -- cgit v1.2.3 From a91cc48246605af9aeef1edd32232976d74d9502 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:21:54 -0800 Subject: KVM: selftests: Test READ=>WRITE dirty logging behavior for shadow MMU Update the nested dirty log test to validate KVM's handling of READ faults when dirty logging is enabled. Specifically, set the Dirty bit in the guest PTEs used to map L2 GPAs, so that KVM will create writable SPTEs when handling L2 read faults. When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. To actually test the L2 READ=>WRITE sequence, e.g. without masking a false pass by other test activity, route the READ=>WRITE and WRITE=>WRITE sequences to separate L1 pages, and differentiate between "marked dirty due to a WRITE access/fault" and "marked dirty due to creating a writable SPTE for a READ access/fault". The updated sequence exposes the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML") when the guest performs a READ=>WRITE sequence with dirty guest PTEs. Opportunistically tweak and rename the address macros, and add comments, to make it more obvious what the test is doing. E.g. NESTED_TEST_MEM1 vs. GUEST_TEST_MEM doesn't make it all that obvious that the test is creating aliases in both the L2 GPA and GVA address spaces, but only when L1 is using TDP to run L2. Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20260115172154.709024-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/lib/x86/processor.c | 7 + .../selftests/kvm/x86/nested_dirty_log_test.c | 187 +++++++++++++++------ 3 files changed, 143 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 6bfffc3b0a33..4ebae4269e68 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1486,6 +1486,7 @@ bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index ab869a98bbdc..fab18e9be66c 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,6 +390,13 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa) +{ + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level); +} + uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c index 89d2e86a0db9..619229bbd693 100644 --- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -17,29 +17,54 @@ /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 +/* + * Allocate four pages total. Two pages are used to verify that the KVM marks + * the accessed page/GFN as marked dirty, but not the "other" page. Times two + * so that each "normal" page can be accessed from L2 via an aliased L2 GVA+GPA + * (when TDP is enabled), to verify KVM marks _L1's_ page/GFN as dirty (to + * detect failures, L2 => L1 GPAs can't be identity mapped in the TDP page + * tables, as marking L2's GPA dirty would get a false pass if L1 == L2). + */ +#define TEST_MEM_PAGES 4 + +#define TEST_MEM_BASE 0xc0000000 +#define TEST_MEM_ALIAS_BASE 0xc0002000 + +#define TEST_GUEST_ADDR(base, idx) ((base) + (idx) * PAGE_SIZE) -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 +#define TEST_GVA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) +#define TEST_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) + +#define TEST_ALIAS_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_ALIAS_BASE, idx) + +#define TEST_HVA(vm, idx) addr_gpa2hva(vm, TEST_GPA(idx)) #define L2_GUEST_STACK_SIZE 64 -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +/* Use the page offset bits to communicate the access+fault type. */ +#define TEST_SYNC_READ_FAULT BIT(0) +#define TEST_SYNC_WRITE_FAULT BIT(1) +#define TEST_SYNC_NO_FAULT BIT(2) - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +static void l2_guest_code(vm_vaddr_t base) +{ + vm_vaddr_t page0 = TEST_GUEST_ADDR(base, 0); + vm_vaddr_t page1 = TEST_GUEST_ADDR(base, 1); + + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_READ_FAULT); + WRITE_ONCE(*(u64 *)page0, 1); + GUEST_SYNC(page0 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_NO_FAULT); + + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page1); + GUEST_SYNC(page1 | TEST_SYNC_NO_FAULT); /* Exit to L1 and never come back. */ vmcall(); @@ -47,13 +72,22 @@ static void l2_guest_code(u64 *a, u64 *b) static void l2_guest_code_tdp_enabled(void) { - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); + /* + * Use the aliased virtual addresses when running with TDP to verify + * that KVM correctly handles the case where a page is dirtied via a + * different GPA than would be used by L1. + */ + l2_guest_code(TEST_MEM_ALIAS_BASE); } static void l2_guest_code_tdp_disabled(void) { - /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); + /* + * Use the "normal" virtual addresses when running without TDP enabled, + * in which case L2 will use the same page tables as L1, and thus needs + * to use the same virtual addresses that are mapped into L1. + */ + l2_guest_code(TEST_MEM_BASE); } void l1_vmx_code(struct vmx_pages *vmx) @@ -72,9 +106,9 @@ void l1_vmx_code(struct vmx_pages *vmx) prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); GUEST_DONE(); } @@ -91,9 +125,9 @@ static void l1_svm_code(struct svm_test_data *svm) generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); run_guest(svm->vmcb, svm->vmcb_gpa); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); GUEST_DONE(); } @@ -106,12 +140,66 @@ static void l1_guest_code(void *data) l1_svm_code(data); } +static void test_handle_ucall_sync(struct kvm_vm *vm, u64 arg, + unsigned long *bmap) +{ + vm_vaddr_t gva = arg & ~(PAGE_SIZE - 1); + int page_nr, i; + + /* + * Extract the page number of underlying physical page, which is also + * the _L1_ page number. The dirty bitmap _must_ be updated based on + * the L1 GPA, not L2 GPA, i.e. whether or not L2 used an aliased GPA + * (i.e. if TDP enabled for L2) is irrelevant with respect to the dirty + * bitmap and which underlying physical page is accessed. + * + * Note, gva will be '0' if there was no access, i.e. if the purpose of + * the sync is to verify all pages are clean. + */ + if (!gva) + page_nr = 0; + else if (gva >= TEST_MEM_ALIAS_BASE) + page_nr = (gva - TEST_MEM_ALIAS_BASE) >> PAGE_SHIFT; + else + page_nr = (gva - TEST_MEM_BASE) >> PAGE_SHIFT; + TEST_ASSERT(page_nr == 0 || page_nr == 1, + "Test bug, unexpected frame number '%u' for arg = %lx", page_nr, arg); + TEST_ASSERT(gva || (arg & TEST_SYNC_NO_FAULT), + "Test bug, gva must be valid if a fault is expected"); + + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + /* + * Check all pages to verify the correct physical page was modified (or + * not), and that all pages are clean/dirty as expected. + * + * If a fault of any kind is expected, the target page should be dirty + * as the Dirty bit is set in the gPTE. KVM should create a writable + * SPTE even on a read fault, *and* KVM must mark the GFN as dirty + * when doing so. + */ + for (i = 0; i < TEST_MEM_PAGES; i++) { + if (i == page_nr && (arg & TEST_SYNC_WRITE_FAULT)) + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 1, + "Page %u incorrectly not written by guest", i); + else + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 0xaaaaaaaaaaaaaaaaULL, + "Page %u incorrectly written by guest", i); + + if (i == page_nr && !(arg & TEST_SYNC_NO_FAULT)) + TEST_ASSERT(test_bit(i, bmap), + "Page %u incorrectly reported clean on %s fault", + i, arg & TEST_SYNC_READ_FAULT ? "read" : "write"); + else + TEST_ASSERT(!test_bit(i, bmap), + "Page %u incorrectly reported dirty", i); + } +} + static void test_dirty_log(bool nested_tdp) { vm_vaddr_t nested_gva = 0; unsigned long *bmap; - uint64_t *host_test_mem; - struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; @@ -133,35 +221,46 @@ static void test_dirty_log(bool nested_tdp) /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, TEST_MEM_PAGES, KVM_MEM_LOG_DIRTY_PAGES); /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * Add an identity map for GVA range [0xc0000000, 0xc0004000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + virt_map(vm, TEST_MEM_BASE, TEST_MEM_BASE, TEST_MEM_PAGES); /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. + * ... pages in the L2 GPA address range [0xc0002000, 0xc0004000) will + * map to [0xc0000000, 0xc0002000) when TDP is enabled (for L2). * * When TDP is disabled, the L2 guest code will still access the same L1 * GPAs as the TDP enabled case. + * + * Set the Dirty bit in the PTEs used by L2 so that KVM will create + * writable SPTEs when handling read faults (if the Dirty bit isn't + * set, KVM must intercept the next write to emulate the Dirty bit + * update). */ if (nested_tdp) { tdp_identity_map_default_memslots(vm); - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(0), TEST_GPA(0), PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(1), TEST_GPA(1), PAGE_SIZE); + + *tdp_get_pte(vm, TEST_ALIAS_GPA(0)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + *tdp_get_pte(vm, TEST_ALIAS_GPA(1)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + } else { + *vm_get_pte(vm, TEST_GVA(0)) |= PTE_DIRTY_MASK(&vm->mmu); + *vm_get_pte(vm, TEST_GVA(1)) |= PTE_DIRTY_MASK(&vm->mmu); } bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + memset(TEST_HVA(vm, 0), 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -170,23 +269,7 @@ static void test_dirty_log(bool nested_tdp) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + test_handle_ucall_sync(vm, uc.args[1], bmap); break; case UCALL_DONE: done = true; -- cgit v1.2.3 From 086c99fbe45070d02851427eab5ae26fe7d0f3c0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 15 Jan 2026 07:11:41 -0800 Subject: selftests: bpf: Add test for multiple syncs from linked register Before the last commit, sync_linked_regs() corrupted the register whose bounds are being updated by copying known_reg's id to it. The ids are the same in value but known_reg has the BPF_ADD_CONST flag which is wrongly copied to reg. This later causes issues when creating new links to this reg. assign_scalar_id_before_mov() sees this BPF_ADD_CONST and gives a new id to this register and breaks the old links. This is exposed by the added selftest. Signed-off-by: Puranjay Mohan Tested-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260115151143.1344724-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_linked_scalars.c | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c index 8f755d2464cf..5f41bbb730a7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c +++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c @@ -31,4 +31,37 @@ l1: \ " ::: __clobber_all); } +/* + * Test that sync_linked_regs() preserves register IDs. + * + * The sync_linked_regs() function copies bounds from known_reg to linked + * registers. When doing so, it must preserve each register's original id + * to allow subsequent syncs from the same source to work correctly. + * + */ +SEC("socket") +__success +__naked void sync_linked_regs_preserves_id(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; /* r0 in [0, 255] */ \ + r1 = r0; /* r0, r1 linked with id 1 */ \ + r1 += 4; /* r1 has id=1 and off=4 in [4, 259] */ \ + if r1 < 10 goto l0_%=; \ + /* r1 in [10, 259], r0 synced to [6, 255] */ \ + r2 = r0; /* r2 has id=1 and in [6, 255] */ \ + if r1 < 14 goto l0_%=; \ + /* r1 in [14, 259], r0 synced to [10, 255] */ \ + if r0 >= 10 goto l0_%=; \ + /* Never executed */ \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b8f7622aa6e32d6fd750697b99d8ce19ad8e66d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Dec 2025 14:03:25 +0100 Subject: selftests/open_tree: add OPEN_TREE_NAMESPACE tests Add tests for OPEN_TREE_NAMESPACE. Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-2-bfb24c7b061f@kernel.org Tested-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- .../selftests/filesystems/open_tree_ns/.gitignore | 1 + .../selftests/filesystems/open_tree_ns/Makefile | 10 + .../filesystems/open_tree_ns/open_tree_ns_test.c | 1030 ++++++++++++++++++++ tools/testing/selftests/filesystems/utils.c | 26 + tools/testing/selftests/filesystems/utils.h | 1 + 5 files changed, 1068 insertions(+) create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/.gitignore create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/Makefile create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore new file mode 100644 index 000000000000..fb12b93fbcaa --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore @@ -0,0 +1 @@ +open_tree_ns_test diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile new file mode 100644 index 000000000000..73c03c4a7ef6 --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := open_tree_ns_test + +CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) +LDLIBS := -lcap + +include ../../lib.mk + +$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c new file mode 100644 index 000000000000..9711556280ae --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c @@ -0,0 +1,1030 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for OPEN_TREE_NAMESPACE flag. + * + * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount + * namespace containing the specified mount tree. + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../wrappers.h" +#include "../statmount/statmount.h" +#include "../utils.h" +#include "../../kselftest_harness.h" + +#ifndef OPEN_TREE_NAMESPACE +#define OPEN_TREE_NAMESPACE (1 << 1) +#endif + +static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) +{ + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) + return -errno; + return 0; +} + +static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + ret = get_mnt_ns_id(fd, mnt_ns_id); + close(fd); + return ret; +} + +#define STATMOUNT_BUFSIZE (1 << 15) + +static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + +static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) +{ + const char *fs_type = ""; + const char *mnt_root = ""; + const char *mnt_point = ""; + + if (sm->mask & STATMOUNT_FS_TYPE) + fs_type = sm->str + sm->fs_type; + if (sm->mask & STATMOUNT_MNT_ROOT) + mnt_root = sm->str + sm->mnt_root; + if (sm->mask & STATMOUNT_MNT_POINT) + mnt_point = sm->str + sm->mnt_point; + + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s", + (unsigned long long)sm->mnt_id, + (unsigned long long)sm->mnt_parent_id, + fs_type, mnt_root, mnt_point); +} + +static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) +{ + uint64_t list[256]; + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) { + TH_LOG("listmount failed: %s", strerror(errno)); + return; + } + + TH_LOG("Mount namespace %llu contains %zd mount(s):", + (unsigned long long)mnt_ns_id, nr_mounts); + + for (ssize_t i = 0; i < nr_mounts; i++) { + struct statmount *sm; + + sm = statmount_alloc(list[i], mnt_ns_id, + STATMOUNT_MNT_BASIC | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT); + if (!sm) { + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", + i, (unsigned long long)list[i], strerror(errno)); + continue; + } + + log_mount(_metadata, sm); + free(sm); + } +} + +FIXTURE(open_tree_ns) +{ + int fd; + uint64_t current_ns_id; +}; + +FIXTURE_VARIANT(open_tree_ns) +{ + const char *path; + unsigned int flags; + bool expect_success; + bool expect_different_ns; + int min_mounts; +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, basic_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + /* + * The empty rootfs is hidden from listmount()/mountinfo, + * so we only see the bind mount on top of it. + */ + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_proc) +{ + .path = "/proc", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_run) +{ + .path = "/run", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, invalid_recursive_alone) +{ + .path = "/", + .flags = AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = false, + .expect_different_ns = false, + .min_mounts = 0, +}; + +FIXTURE_SETUP(open_tree_ns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); + + /* Get current mount namespace ID for comparison */ + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); + if (ret < 0) + SKIP(return, "Failed to get current mount namespace ID"); +} + +FIXTURE_TEARDOWN(open_tree_ns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns, create_namespace) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + + if (!variant->expect_success) { + ASSERT_LT(self->fd, 0); + ASSERT_EQ(errno, EINVAL); + return; + } + + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Verify we can get the namespace ID */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + /* Verify it's a different namespace */ + if (variant->expect_different_ns) + ASSERT_NE(new_ns_id, self->current_ns_id); + + /* List mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("%m - listmount failed"); + } + + /* Verify minimum expected mounts */ + ASSERT_GE(nr_mounts, variant->min_mounts); + TH_LOG("Namespace contains %zd mounts", nr_mounts); +} + +TEST_F(open_tree_ns, setns_into_namespace) +{ + uint64_t new_ns_id; + pid_t pid; + int status; + int ret; + + /* Only test with basic flags */ + if (!(variant->flags & OPEN_TREE_NAMESPACE)) + SKIP(return, "setns test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Get namespace ID and dump all mounts */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + dump_mounts(_metadata, new_ns_id); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: try to enter the namespace */ + if (setns(self->fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(open_tree_ns, verify_mount_properties) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + /* Only test with basic flags on root */ + if (variant->flags != (OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC) || + strcmp(variant->path, "/") != 0) + SKIP(return, "mount properties test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, "/", OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + /* Get info about the root mount (the bind mount, rootfs is hidden) */ + ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); + + TH_LOG("Root mount id: %llu, parent: %llu", + (unsigned long long)sm.mnt_id, + (unsigned long long)sm.mnt_parent_id); +} + +FIXTURE(open_tree_ns_caps) +{ + bool has_caps; +}; + +FIXTURE_SETUP(open_tree_ns_caps) +{ + int ret; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + self->has_caps = (geteuid() == 0); +} + +FIXTURE_TEARDOWN(open_tree_ns_caps) +{ +} + +TEST_F(open_tree_ns_caps, requires_cap_sys_admin) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + + /* Child: drop privileges using utils.h helper */ + if (enter_userns() != 0) + _exit(2); + + /* Drop all caps using utils.h helper */ + if (caps_down() == 0) + _exit(3); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd >= 0) { + close(fd); + /* Should have failed without caps */ + _exit(1); + } + + if (errno == EPERM) + _exit(0); + + /* EINVAL means OPEN_TREE_NAMESPACE not supported */ + if (errno == EINVAL) + _exit(4); + + /* Unexpected error */ + _exit(5); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Expected: EPERM without caps */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("OPEN_TREE_NAMESPACE succeeded without caps"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "caps_down failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_userns) +{ + int fd; +}; + +FIXTURE_SETUP(open_tree_ns_userns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(open_tree_ns_userns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns_userns, create_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace (also creates mount namespace) */ + if (enter_userns() != 0) + _exit(2); + + /* Now we have CAP_SYS_ADMIN in the user namespace */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); /* OPEN_TREE_NAMESPACE not supported */ + _exit(1); + } + + /* Verify we can get the namespace ID */ + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Verify we can list mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Should have at least 1 mount */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, setns_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + int fd; + pid_t inner_pid; + int inner_status; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Fork again to test setns into the new namespace */ + inner_pid = fork(); + if (inner_pid < 0) + _exit(8); + + if (inner_pid == 0) { + /* Inner child: enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) + _exit(9); + + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) + _exit(10); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree or setns failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, recursive_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + /* Test recursive flag in userns */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Recursive should copy submounts too */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE|AT_RECURSIVE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_fails_einval) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) == 0) { + free(sm); + _exit(7); + } + + if (errno != EINVAL) { + /* Wrong error */ + free(sm); + _exit(8); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_succeeds) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + if (unshare(CLONE_NEWNS)) + _exit(1); + + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) + _exit(1); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) != 0) { + free(sm); + _exit(7); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_unbindable) +{ + char tmpdir[PATH_MAX]; + bool mounted; +}; + +FIXTURE_SETUP(open_tree_ns_unbindable) +{ + int ret; + + self->mounted = false; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Create a temporary directory for the test mount */ + snprintf(self->tmpdir, sizeof(self->tmpdir), + "/tmp/open_tree_ns_test.XXXXXX"); + ASSERT_NE(mkdtemp(self->tmpdir), NULL); + + /* Mount tmpfs there */ + ret = mount("tmpfs", self->tmpdir, "tmpfs", 0, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to mount tmpfs"); + } + self->mounted = true; + + ret = mount(NULL, self->tmpdir, NULL, MS_UNBINDABLE, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to make tmpfs unbindable"); + } +} + +FIXTURE_TEARDOWN(open_tree_ns_unbindable) +{ + if (self->mounted) + umount2(self->tmpdir, MNT_DETACH); + rmdir(self->tmpdir); +} + +TEST_F(open_tree_ns_unbindable, fails_on_unbindable) +{ + int fd; + + fd = sys_open_tree(AT_FDCWD, self->tmpdir, + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + ASSERT_LT(fd, 0); +} + +TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + bool found_unbindable = false; + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + ASSERT_GT(fd, 0); + + ASSERT_EQ(get_mnt_ns_id(fd, &new_ns_id), 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("listmount failed: %m"); + } + + /* + * Iterate through all mounts in the new namespace and verify + * the unbindable tmpfs mount was silently dropped. + */ + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); + ASSERT_NE(sm, NULL) { + TH_LOG("statmount_alloc failed for mnt_id %llu", + (unsigned long long)list[i]); + } + + mnt_point = sm->str + sm->mnt_point; + + if (strcmp(mnt_point, self->tmpdir) == 0) { + TH_LOG("Found unbindable mount at %s (should have been dropped)", + mnt_point); + found_unbindable = true; + } + + free(sm); + } + + ASSERT_FALSE(found_unbindable) { + TH_LOG("Unbindable mount at %s was not dropped", self->tmpdir); + } + + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c9dd5412b37b..d6f26f849053 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -515,6 +515,32 @@ int setup_userns(void) return 0; } +int enter_userns(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWUSER); + if (ret) + return ret; + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret) + return ret; + + return 0; +} + /* caps_down - lower all effective caps */ int caps_down(void) { diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h index 70f7ccc607f4..0bccfed666a9 100644 --- a/tools/testing/selftests/filesystems/utils.h +++ b/tools/testing/selftests/filesystems/utils.h @@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down); extern bool switch_ids(uid_t uid, gid_t gid); extern int setup_userns(void); +extern int enter_userns(void); static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) { -- cgit v1.2.3 From 7799ba2160e4919913ecabca8a7fc1aa4c576fb4 Mon Sep 17 00:00:00 2001 From: João Marcos Costa Date: Tue, 13 Jan 2026 14:27:53 +0100 Subject: cpupower: make systemd unit installation optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpupower currently installs a cpupower.service unit file into unitdir unconditionally, regardless of whether systemd is used by the host. Improve the installation procedure by making this systemd step optional: a 'SYSTEMD' build parameter that defaults to 'true' and can be set to 'false' to disable the installation of systemd's unit file. Since 'SYSTEMD' defaults to true, the current behavior is kept as the default. Link: https://lore.kernel.org/r/20260113132753.1730020-2-joaomarcos.costa@bootlin.com Signed-off-by: João Marcos Costa Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index a1df9196dc45..969716dfe8de 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -315,7 +315,17 @@ endif $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h $(INSTALL_DATA) lib/powercap.h $(DESTDIR)${includedir}/powercap.h -install-tools: $(OUTPUT)cpupower +# SYSTEMD=false disables installation of the systemd unit file +SYSTEMD ?= true + +install-systemd: + $(INSTALL) -d $(DESTDIR)${unitdir} + sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service' + $(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service' + +INSTALL_SYSTEMD := $(if $(filter true,$(strip $(SYSTEMD))),install-systemd) + +install-tools: $(OUTPUT)cpupower $(INSTALL_SYSTEMD) $(INSTALL) -d $(DESTDIR)${bindir} $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} @@ -324,9 +334,6 @@ install-tools: $(OUTPUT)cpupower $(INSTALL_DATA) cpupower-service.conf '$(DESTDIR)${confdir}' $(INSTALL) -d $(DESTDIR)${libexecdir} $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' - $(INSTALL) -d $(DESTDIR)${unitdir} - sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service' - $(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service' install-man: $(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1 @@ -406,4 +413,4 @@ help: @echo ' uninstall - Remove previously installed files from the dir defined by "DESTDIR"' @echo ' cmdline or Makefile config block option (default: "")' -.PHONY: all utils libcpupower update-po create-gmo install-lib install-tools install-man install-gmo install uninstall clean help +.PHONY: all utils libcpupower update-po create-gmo install-lib install-systemd install-tools install-man install-gmo install uninstall clean help -- cgit v1.2.3 From 3ec6cefc398b93e5f28500f80e7321a80fffee8a Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Fri, 16 Jan 2026 11:20:54 -0300 Subject: selftests/run_kselftest.sh: Add `--skip` argument option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the only way of excluding certain tests from a collection is by passing all the other tests explicitly via `--test`. Therefore, if the user wants to skip a single test the resulting command line might be too big, depending on the collection. Add an option `--skip` that takes care of that. Link: https://lore.kernel.org/r/20260116-selftests-add_skip_opt-v1-1-ab54afaae81b@suse.com Signed-off-by: Ricardo B. Marlière Signed-off-by: Shuah Khan --- tools/testing/selftests/run_kselftest.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh index d4be97498b32..84d45254675c 100755 --- a/tools/testing/selftests/run_kselftest.sh +++ b/tools/testing/selftests/run_kselftest.sh @@ -30,6 +30,7 @@ Usage: $0 [OPTIONS] -s | --summary Print summary with detailed log in output.log (conflict with -p) -p | --per-test-log Print test log in /tmp with each test name (conflict with -s) -t | --test COLLECTION:TEST Run TEST from COLLECTION + -S | --skip COLLECTION:TEST Skip TEST from COLLECTION -c | --collection COLLECTION Run all tests from COLLECTION -l | --list List the available collection:test entries -d | --dry-run Don't actually run any tests @@ -43,6 +44,7 @@ EOF COLLECTIONS="" TESTS="" +SKIP="" dryrun="" kselftest_override_timeout="" ERROR_ON_FAIL=true @@ -58,6 +60,9 @@ while true; do -t | --test) TESTS="$TESTS $2" shift 2 ;; + -S | --skip) + SKIP="$SKIP $2" + shift 2 ;; -c | --collection) COLLECTIONS="$COLLECTIONS $2" shift 2 ;; @@ -109,6 +114,12 @@ if [ -n "$TESTS" ]; then done available="$(echo "$valid" | sed -e 's/ /\n/g')" fi +# Remove tests to be skipped from available list +if [ -n "$SKIP" ]; then + for skipped in $SKIP ; do + available="$(echo "$available" | grep -v "^${skipped}$")" + done +fi kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)" export kselftest_failures_file -- cgit v1.2.3 From db7855c96d4216b2ed45e2781fae9293b323c7ef Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 16 Jan 2026 12:40:56 -0800 Subject: x86/entry/vdso/selftest: Update location of vgetrandom-chacha.S As part of the vdso build restructuring, vgetrandom-chacha.S moved into the vdso/vdso64 subdirectory. Update the selftest #include to match. Closes: https://lore.kernel.org/oe-lkp/202601161608.5cd5af9a-lkp@intel.com Fixes: 693c819fedcd ("x86/entry/vdso: Refactor the vdso build") Reported-by: kernel test robot Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20260116204057.386268-4-hpa@zytor.com --- tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S index a4a82e1c28a9..10f982157a1f 100644 --- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S +++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S @@ -16,5 +16,5 @@ #elif defined(__s390x__) #include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S" #elif defined(__x86_64__) -#include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S" +#include "../../../../arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S" #endif -- cgit v1.2.3 From 999b2395e3c32273dec98f811f0ab5c8a7441850 Mon Sep 17 00:00:00 2001 From: Gyutae Bae Date: Mon, 12 Jan 2026 12:45:16 +0900 Subject: bpftool: Add 'prepend' option for tcx attach to insert at chain start Add support for the 'prepend' option when attaching tcx_ingress and tcx_egress programs. This option allows inserting a BPF program at the beginning of the TCX chain instead of appending it at the end. The implementation uses BPF_F_BEFORE flag which automatically inserts the program at the beginning of the chain when no relative reference is specified. This change includes: - Modify do_attach_tcx() to support prepend insertion using BPF_F_BEFORE - Update documentation to describe the new 'prepend' option - Add bash completion support for the 'prepend' option on tcx attach types - Add example usage in the documentation - Add validation to reject 'overwrite' for non-XDP attach types The 'prepend' option is only valid for tcx_ingress and tcx_egress attach types. For XDP attach types, the existing 'overwrite' option remains available. Example usage: # bpftool net attach tcx_ingress name tc_prog dev lo prepend This feature is useful when the order of program execution in the TCX chain matters and users need to ensure certain programs run first. Co-developed-by: Siwan Kim Signed-off-by: Siwan Kim Signed-off-by: Gyutae Bae Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20260112034516.22723-1-gyutae.opensource@navercorp.com --- tools/bpf/bpftool/Documentation/bpftool-net.rst | 30 ++++++++++++++++++------ tools/bpf/bpftool/bash-completion/bpftool | 9 ++++++- tools/bpf/bpftool/net.c | 31 +++++++++++++++++++++---- 3 files changed, 58 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index a9ed8992800f..22da07087e42 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -24,7 +24,7 @@ NET COMMANDS ============ | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** | **prepend** ] | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** | @@ -58,11 +58,9 @@ bpftool net { show | list } [ dev *NAME* ] then all bpf programs attached to non clsact qdiscs, and finally all bpf programs attached to root and clsact qdisc. -bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite | prepend ] Attach bpf program *PROG* to network interface *NAME* with type specified - by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the - command used with **overwrite** option. Currently, only XDP-related modes - are supported for *ATTACH_TYPE*. + by *ATTACH_TYPE*. *ATTACH_TYPE* can be of: **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; @@ -72,11 +70,18 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **tcx_ingress** - Ingress TCX. runs on ingress net traffic; **tcx_egress** - Egress TCX. runs on egress net traffic; + For XDP-related attach types (**xdp**, **xdpgeneric**, **xdpdrv**, + **xdpoffload**), the **overwrite** option can be used to replace a + previously attached bpf program. + + For **tcx_ingress** and **tcx_egress** attach types, the **prepend** option + can be used to attach the program at the beginning of the chain instead of + at the end. + bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used - for attach must be specified. Currently, only XDP-related modes are - supported for *ATTACH_TYPE*. + for attach must be specified. bpftool net help Print short help message. @@ -191,6 +196,17 @@ EXAMPLES tc: lo(1) tcx/ingress tc_prog prog_id 29 +| +| **# bpftool net attach tcx_ingress name tc_prog2 dev lo prepend** +| **# bpftool net** +| + +:: + + tc: + lo(1) tcx/ingress tc_prog2 prog_id 30 + lo(1) tcx/ingress tc_prog prog_id 29 + | | **# bpftool net attach tcx_ingress name tc_prog dev lo** | **# bpftool net detach tcx_ingress dev lo** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 53bcfeb1a76e..a28f0cc522e4 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1142,7 +1142,14 @@ _bpftool() return 0 ;; 8) - _bpftool_once_attr 'overwrite' + case ${words[3]} in + tcx_ingress|tcx_egress) + _bpftool_once_attr 'prepend' + ;; + *) + _bpftool_once_attr 'overwrite' + ;; + esac return 0 ;; esac diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index cfc6f944f7c3..f25d66c8395e 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -666,10 +666,16 @@ static int get_tcx_type(enum net_attach_type attach_type) } } -static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex, bool prepend) { int type = get_tcx_type(attach_type); + if (prepend) { + LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = BPF_F_BEFORE + ); + return bpf_prog_attach_opts(progfd, ifindex, type, &opts); + } return bpf_prog_attach(progfd, ifindex, type, 0); } @@ -685,6 +691,7 @@ static int do_attach(int argc, char **argv) enum net_attach_type attach_type; int progfd, ifindex, err = 0; bool overwrite = false; + bool prepend = false; /* parse attach args */ if (!REQ_ARGS(5)) @@ -709,9 +716,25 @@ static int do_attach(int argc, char **argv) if (argc) { if (is_prefix(*argv, "overwrite")) { + if (attach_type != NET_ATTACH_TYPE_XDP && + attach_type != NET_ATTACH_TYPE_XDP_GENERIC && + attach_type != NET_ATTACH_TYPE_XDP_DRIVER && + attach_type != NET_ATTACH_TYPE_XDP_OFFLOAD) { + p_err("'overwrite' is only supported for xdp types"); + err = -EINVAL; + goto cleanup; + } overwrite = true; + } else if (is_prefix(*argv, "prepend")) { + if (attach_type != NET_ATTACH_TYPE_TCX_INGRESS && + attach_type != NET_ATTACH_TYPE_TCX_EGRESS) { + p_err("'prepend' is only supported for tcx_ingress/tcx_egress"); + err = -EINVAL; + goto cleanup; + } + prepend = true; } else { - p_err("expected 'overwrite', got: '%s'?", *argv); + p_err("expected 'overwrite' or 'prepend', got: '%s'?", *argv); err = -EINVAL; goto cleanup; } @@ -728,7 +751,7 @@ static int do_attach(int argc, char **argv) /* attach tcx prog */ case NET_ATTACH_TYPE_TCX_INGRESS: case NET_ATTACH_TYPE_TCX_EGRESS: - err = do_attach_tcx(progfd, attach_type, ifindex); + err = do_attach_tcx(progfd, attach_type, ifindex, prepend); break; default: break; @@ -985,7 +1008,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list } [dev ]\n" - " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite ]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite | prepend ]\n" " %1$s %2$s detach ATTACH_TYPE dev \n" " %1$s %2$s help\n" "\n" -- cgit v1.2.3 From 47d440d0a5bb822f3f4e4b2479246da5efb765e6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jan 2026 16:34:57 +0000 Subject: selftests/bpf: Support when CONFIG_VXLAN=m If CONFIG_VXLAN is 'm', struct vxlanhdr will not be in vmlinux.h. Add a ___local variant to support cases where vxlan is a module. Fixes: 8517b1abe5ea ("selftests/bpf: Integrate test_tc_tunnel.sh tests into test_progs") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115163457.146267-1-alan.maguire@oracle.com --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 7330c61b5730..7376df405a6b 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -23,7 +23,12 @@ static const int cfg_udp_src = 20000; (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) +struct vxlanhdr___local { + __be32 vx_flags; + __be32 vx_vni; +}; + +#define L2_PAD_SZ (sizeof(struct vxlanhdr___local) + ETH_HLEN) #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 @@ -154,7 +159,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -195,12 +200,12 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -285,7 +290,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -325,12 +330,12 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -639,7 +644,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) olen += ETH_HLEN; break; case VXLAN_UDP_PORT: - olen += ETH_HLEN + sizeof(struct vxlanhdr); + olen += ETH_HLEN + sizeof(struct vxlanhdr___local); break; } break; -- cgit v1.2.3 From efad162f5a840ae178e7761c176c49f433c7bb68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 15 Jan 2026 21:22:45 -0800 Subject: selftests/bpf: Fix map_kptr test failure On my arm64 machine, I get the following failure: ... tester_init:PASS:tester_log_buf 0 nsec process_subtest:PASS:obj_open_mem 0 nsec process_subtest:PASS:specs_alloc 0 nsec serial_test_map_kptr:PASS:rcu_tasks_trace_gp__open_and_load 0 nsec ... test_map_kptr_success:PASS:map_kptr__open_and_load 0 nsec test_map_kptr_success:PASS:test_map_kptr_ref1 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref1 retval unexpected error: 2 (errno 2) test_map_kptr_success:PASS:test_map_kptr_ref2 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref2 retval unexpected error: 1 (errno 2) ... #201/21 map_kptr/success-map:FAIL In serial_test_map_kptr(), before test_map_kptr_success(), one kern_sync_rcu() is used to have some delay for freeing the map. But in my environment, one kern_sync_rcu() seems not enough and caused the test failure. In bpf_map_free_in_work() in syscall.c, the queue time for queue_work(system_dfl_wq, &map->work) may be longer than expected. This may cause the test failure since test_map_kptr_success() expects all previous maps having been freed. Since it is not clear how long queue_work() time takes, a bpf prog is added to count the reference after bpf_kfunc_call_test_acquire(). If the number of references is 2 (for initial ref and the one just acquired), all previous maps should have been released. This will resolve the above 'retval unexpected error' issue. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260116052245.3692405-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 23 +++++++++++++++++++++++ tools/testing/selftests/bpf/progs/map_kptr.c | 18 ++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 8743df599567..f372162c0280 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -131,6 +131,25 @@ static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu) return 0; } +static void wait_for_map_release(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, lopts); + struct map_kptr *skel; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + do { + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &lopts); + ASSERT_OK(ret, "count_ref ret"); + ASSERT_OK(lopts.retval, "count_ref retval"); + } while (skel->bss->num_of_refs != 2); + + map_kptr__destroy(skel); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; @@ -148,11 +167,15 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on bpf_map_free_deferred */ test_map_kptr_success(false); ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on synchronous delete elem */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index edaba481db9d..e708ffbe1f61 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -487,6 +487,24 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) return 0; } +int num_of_refs; + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { -- cgit v1.2.3 From 6588b8845e7387438d4b91ea86e7cb6d838b3108 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 16 Jan 2026 15:30:23 -0800 Subject: tools/power/x86/intel-speed-select: Allow non root users When permitted by the file /dev/isst_interface, allow to issue commands for non root users. When user id is non root, check if "/dev/isst_interface" can still be opened. If this file can be opened, allow all read only commands. Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/isst-config.c | 39 ++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 558138eea75e..807feb17bb81 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -80,6 +80,18 @@ struct cpu_topology { short die_id; }; +static int read_only; + +static void check_privilege(void) +{ + if (!read_only) + return; + + isst_display_error_info_message(1, "Insufficient privileges", 0, 0); + isst_ctdp_display_information_end(outf); + exit(1); +} + FILE *get_output_file(void) { return outf; @@ -1578,6 +1590,8 @@ free_mask: static void set_tdp_level(int arg) { + check_privilege(); + if (cmd_help) { fprintf(stderr, "Set Config TDP level\n"); fprintf(stderr, @@ -2046,6 +2060,8 @@ static void set_pbf_enable(int arg) { int enable = arg; + check_privilege(); + if (cmd_help) { if (enable) { fprintf(stderr, @@ -2212,6 +2228,8 @@ static void set_fact_enable(int arg) int i, ret, enable = arg; struct isst_id id; + check_privilege(); + if (cmd_help) { if (enable) { fprintf(stderr, @@ -2361,6 +2379,8 @@ static void set_clos_enable(int arg) { int enable = arg; + check_privilege(); + if (cmd_help) { if (enable) { fprintf(stderr, @@ -2491,6 +2511,8 @@ static void set_clos_config_for_cpu(struct isst_id *id, void *arg1, void *arg2, static void set_clos_config(int arg) { + check_privilege(); + if (cmd_help) { fprintf(stderr, "Set core-power configuration for one of the four clos ids\n"); @@ -2556,6 +2578,8 @@ static void set_clos_assoc_for_cpu(struct isst_id *id, void *arg1, void *arg2, v static void set_clos_assoc(int arg) { + check_privilege(); + if (cmd_help) { fprintf(stderr, "Associate a clos id to a CPU\n"); fprintf(stderr, @@ -2637,6 +2661,8 @@ static void set_turbo_mode(int arg) int i, disable = arg; struct isst_id id; + check_privilege(); + if (cmd_help) { if (disable) fprintf(stderr, "Set turbo mode disable\n"); @@ -2682,6 +2708,7 @@ static void get_set_trl(struct isst_id *id, void *arg1, void *arg2, void *arg3, } if (set) { + check_privilege(); ret = isst_set_trl(id, fact_trl); isst_display_result(id, outf, "turbo-mode", "set-trl", ret); return; @@ -3204,8 +3231,16 @@ static void cmdline(int argc, char **argv) }; if (geteuid() != 0) { - fprintf(stderr, "Must run as root\n"); - exit(0); + int fd; + + fd = open(pathname, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Must run as root\n"); + exit(0); + } + fprintf(stderr, "\nNot running as root, Only read only operations are supported\n"); + close(fd); + read_only = 1; } ret = update_cpu_model(); -- cgit v1.2.3 From 21adcd5ec99f342489a49e9d237a987b1bd9fab5 Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Mon, 29 Dec 2025 12:45:06 -0800 Subject: tools/power/x86/intel-speed-select: Use pkg-config for libnl-3.0 detection Replace hardcoded libnl3 include path with pkg-config detection to improve portability across different distributions and build environments. The previous implementation used a fixed path constructed from the compiler's sysroot, which could fail on systems with non-standard library installations. Now the build system: - Attempts to detect libnl-3.0 include paths using pkg-config - Falls back to /usr/include/libnl3 if pkg-config is unavailable - Maintains backward compatibility with existing build configurations This ensures the tool builds correctly on a wider range of systems while preserving existing behavior when pkg-config is not present. Closes:https://bugzilla.kernel.org/show_bug.cgi?id=220819 Signed-off-by: Khem Raj Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/Makefile b/tools/power/x86/intel-speed-select/Makefile index 8d3a02a20f3d..6b299aae2ded 100644 --- a/tools/power/x86/intel-speed-select/Makefile +++ b/tools/power/x86/intel-speed-select/Makefile @@ -13,7 +13,13 @@ endif # Do not use make's built-in rules # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r -override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -I$(shell $(CC) -print-sysroot)/usr/include/libnl3 + +NL3_CFLAGS = $(shell pkg-config --cflags libnl-3.0 2>/dev/null) +ifeq ($(NL3_CFLAGS),) +NL3_CFLAGS = -I/usr/include/libnl3 +endif + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include $(NL3_CFLAGS) override LDFLAGS += -lnl-genl-3 -lnl-3 ALL_TARGETS := intel-speed-select -- cgit v1.2.3 From 56c17ee151c6e1a73d77e15b82a8e2130cd8dd16 Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Thu, 15 Jan 2026 15:33:33 +0530 Subject: tools/power/x86/intel-speed-select: Fix file descriptor leak in isolate_cpus() The file descriptor opened in isolate_cpus() when (!level) is true was not being closed before returning, causing a file descriptor leak in both the error path and the success path. When write() fails at line 950, the function returns at line 953 without closing the file descriptor. Similarly, on success, the function returns at line 956 without closing the file descriptor. Add close(fd) calls before both return statements to fix the resource leak. This follows the same pattern used elsewhere in the same function where file descriptors are properly closed before returning (see lines 1005 and 1027). Fixes: 997074df658e ("tools/power/x86/intel-speed-select: Use cgroup v2 isolation") Signed-off-by: Malaya Kumar Rout Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/isst-config.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 807feb17bb81..68c9955dd7f4 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -962,9 +962,11 @@ int isolate_cpus(struct isst_id *id, int mask_size, cpu_set_t *cpu_mask, int lev ret = write(fd, "member", strlen("member")); if (ret == -1) { printf("Can't update to member\n"); + close(fd); return ret; } + close(fd); return 0; } -- cgit v1.2.3 From 6142b726e6e64870ab0c7ffb158bffa141f83bb6 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 16 Jan 2026 15:48:50 -0800 Subject: tools/power/x86/intel-speed-select: v1.25 release This version includes the following changes: - Allow read only commands for non root users when permitted - Fix file descriptor leak in isolate_cpus() - Replace hardcoded libnl3 include path Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/isst-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 68c9955dd7f4..dd9056ddb016 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,7 +16,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.24"; +static const char *version_str = "v1.25"; static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; -- cgit v1.2.3 From d9b40d7262a227442bf402ea0708dc94f438bb52 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Wed, 22 Oct 2025 11:59:48 +0530 Subject: selftests/x86: Add selftests include path for kselftest.h after centralization The previous change centralizing kselftest.h include path in lib.mk caused x86 selftests to fail, as x86 Makefile overwrites CFLAGS using ":=", dropping the include path added in lib.mk. Therefore, helpers.h could not find kselftest.h during compilation. Fix this by adding the tools/testing/sefltest to CFLAGS in x86 Makefile. [ bp: Correct commit ID in Fixes: ] Fixes: e6fbd1759c9e ("selftests: complete kselftest include centralization") Closes: https://lore.kernel.org/lkml/CA+G9fYvKjQcCBMfXA-z2YuL2L+3Qd-pJjEUDX8PDdz2-EEQd=Q@mail.gmail.com/T/#m83fd330231287fc9d6c921155bee16c591db7360 Reported-by: Linux Kernel Functional Testing Signed-off-by: Bala-Vignesh-Reddy Signed-off-by: Borislav Petkov (AMD) Tested-by: Anders Roxell Tested-by: Brendan Jackman Link: https://patch.msgid.link/20251022062948.162852-1-reddybalavignesh9979@gmail.com --- tools/testing/selftests/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 83148875a12c..434065215d12 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -36,6 +36,7 @@ BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES) +CFLAGS += -I $(top_srcdir)/tools/testing/selftests/ # call32_from_64 in thunks.S uses absolute addresses. ifeq ($(CAN_BUILD_WITH_NOPIE),1) -- cgit v1.2.3 From d045e166d3c51b7aec069669bb243e057d80d04f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 15 Jan 2026 14:56:52 +0100 Subject: selftests: vDSO: getrandom: Fix path to s390 chacha implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The s390 vDSO source directory was recently moved, but this reference was not updated. Fixes: c0087d807ae8 ("s390/vdso: Rename vdso64 to vdso") Signed-off-by: Thomas Weißschuh Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S index a4a82e1c28a9..8c3cbf4dfd6a 100644 --- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S +++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S @@ -14,7 +14,7 @@ #elif defined(__riscv) && __riscv_xlen == 64 #include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S" #elif defined(__s390x__) -#include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S" +#include "../../../../arch/s390/kernel/vdso/vgetrandom-chacha.S" #elif defined(__x86_64__) #include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S" #endif -- cgit v1.2.3 From a120a832e3ebca48474d7183ddeadf4138472535 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 20 Oct 2025 15:01:16 -0700 Subject: lkdtm/bugs: Add __counted_by_ptr() test PTR_BOUNDS Provide run-time validation of the __counted_by_ptr() annotation via newly added PTR_BOUNDS LKDTM test. Link: https://patch.msgid.link/20251020220118.1226740-2-kees@kernel.org Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 90 ++++++++++++++++++++++++++++++--- tools/testing/selftests/lkdtm/tests.txt | 2 + 2 files changed, 84 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 502059078b45..b2aee36b956d 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -465,32 +465,32 @@ static void lkdtm_ARRAY_BOUNDS(void) pr_expected_config(CONFIG_UBSAN_BOUNDS); } -struct lkdtm_annotated { +struct lkdtm_cb_fam { unsigned long flags; int count; int array[] __counted_by(count); }; -static volatile int fam_count = 4; +static volatile int element_count = 4; static void lkdtm_FAM_BOUNDS(void) { - struct lkdtm_annotated *inst; + struct lkdtm_cb_fam *inst; - inst = kzalloc(struct_size(inst, array, fam_count + 1), GFP_KERNEL); + inst = kzalloc(struct_size(inst, array, element_count + 1), GFP_KERNEL); if (!inst) { pr_err("FAIL: could not allocate test struct!\n"); return; } - inst->count = fam_count; + inst->count = element_count; pr_info("Array access within bounds ...\n"); - inst->array[1] = fam_count; + inst->array[1] = element_count; ignored = inst->array[1]; pr_info("Array access beyond bounds ...\n"); - inst->array[fam_count] = fam_count; - ignored = inst->array[fam_count]; + inst->array[element_count] = element_count; + ignored = inst->array[element_count]; kfree(inst); @@ -505,6 +505,79 @@ static void lkdtm_FAM_BOUNDS(void) pr_expected_config(CONFIG_UBSAN_BOUNDS); } +struct lkdtm_extra { + short a, b; + u16 sixteen; + u32 bigger; + u64 biggest; +}; + +struct lkdtm_cb_ptr { + int a, b, c; + int nr_extra; + char *buf __counted_by_ptr(len); + size_t len; + struct lkdtm_extra *extra __counted_by_ptr(nr_extra); +}; + +static noinline void check_ptr_len(struct lkdtm_cb_ptr *p, size_t len) +{ + if (__member_size(p->buf) != len) + pr_err("FAIL: could not determine size of inst->buf: %zu\n", + __member_size(p->buf)); + else + pr_info("good: inst->buf length is %zu\n", len); +} + +static void lkdtm_PTR_BOUNDS(void) +{ + struct lkdtm_cb_ptr *inst; + + inst = kzalloc(sizeof(*inst), GFP_KERNEL); + if (!inst) { + pr_err("FAIL: could not allocate struct lkdtm_cb_ptr!\n"); + return; + } + + inst->buf = kzalloc(element_count, GFP_KERNEL); + if (!inst->buf) { + pr_err("FAIL: could not allocate inst->buf!\n"); + return; + } + inst->len = element_count; + + /* Double element_count */ + inst->extra = kcalloc(element_count * 2, sizeof(*inst->extra), GFP_KERNEL); + inst->nr_extra = element_count * 2; + + pr_info("Pointer access within bounds ...\n"); + check_ptr_len(inst, 4); + /* All 4 bytes */ + inst->buf[0] = 'A'; + inst->buf[1] = 'B'; + inst->buf[2] = 'C'; + inst->buf[3] = 'D'; + /* Halfway into the array */ + inst->extra[element_count].biggest = 0x1000; + + pr_info("Pointer access beyond bounds ...\n"); + ignored = inst->extra[inst->nr_extra].b; + + kfree(inst->extra); + kfree(inst->buf); + kfree(inst); + + pr_err("FAIL: survived access of invalid pointer member offset!\n"); + + if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY_PTR)) + pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by_ptr\n", + lkdtm_kernel_info); + else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS)) + pr_expected_config(CONFIG_UBSAN_TRAP); + else + pr_expected_config(CONFIG_UBSAN_BOUNDS); +} + static void lkdtm_CORRUPT_LIST_ADD(void) { /* @@ -769,6 +842,7 @@ static struct crashtype crashtypes[] = { CRASHTYPE(OVERFLOW_UNSIGNED), CRASHTYPE(ARRAY_BOUNDS), CRASHTYPE(FAM_BOUNDS), + CRASHTYPE(PTR_BOUNDS), CRASHTYPE(CORRUPT_LIST_ADD), CRASHTYPE(CORRUPT_LIST_DEL), CRASHTYPE(STACK_GUARD_PAGE_LEADING), diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 67cd53715d93..e62b85b591be 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -11,6 +11,8 @@ EXCEPTION #CORRUPT_STACK Crashes entire system on success #CORRUPT_STACK_STRONG Crashes entire system on success ARRAY_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +FAM_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +PTR_BOUNDS call trace:|UBSAN: array-index-out-of-bounds CORRUPT_LIST_ADD list_add corruption CORRUPT_LIST_DEL list_del corruption STACK_GUARD_PAGE_LEADING -- cgit v1.2.3 From 68578370f9b3a2aba5964b273312d51c581b6aad Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 Jan 2026 17:24:47 +0000 Subject: tools: ynl: Specify --no-line-number in ynl-regen.sh. If grep.lineNumber is enabled in .gitconfig, [grep] lineNumber = true ynl-regen.sh fails with the following error: $ ./tools/net/ynl/ynl-regen.sh -f ... ynl_gen_c.py: error: argument --mode: invalid choice: '4:' (choose from user, kernel, uapi) GEN 4: net/ipv4/fou_nl.c Let's specify --no-line-number explicitly. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Suggested-by: Jakub Kicinski Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260115172533.693652-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-regen.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh index 81b4ecd89100..d9809276db98 100755 --- a/tools/net/ynl/ynl-regen.sh +++ b/tools/net/ynl/ynl-regen.sh @@ -21,7 +21,7 @@ files=$(git grep --files-with-matches '^/\* YNL-GEN \(kernel\|uapi\|user\)') for f in $files; do # params: 0 1 2 3 # $YAML YNL-GEN kernel $mode - params=( $(git grep -B1 -h '/\* YNL-GEN' $f | sed 's@/\*\(.*\)\*/@\1@') ) + params=( $(git grep --no-line-number -B1 -h '/\* YNL-GEN' $f | sed 's@/\*\(.*\)\*/@\1@') ) args=$(sed -n 's@/\* YNL-ARG \(.*\) \*/@\1@p' $f) if [ $f -nt ${params[0]} -a -z "$force" ]; then -- cgit v1.2.3 From 52b4859730434902731e1cd4ea061d9611398008 Mon Sep 17 00:00:00 2001 From: Yohei Kojima Date: Tue, 13 Jan 2026 23:11:54 +0900 Subject: selftests: net: fix passive TFO test to fail if child processes failed Improve the passive TFO test to report failure if the server or the client timed out or exited with non-zero status. Before this commit, TFO test didn't fail even if exit(EXIT_FAILURE) is added to the first line of the run_server() and run_client() functions. Signed-off-by: Yohei Kojima Link: https://patch.msgid.link/214d399caec2e5de7738ced5736829915d507e4e.1768312014.git.yk@y-koj.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tfo_passive.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh index a4550511830a..f116f888b794 100755 --- a/tools/testing/selftests/net/tfo_passive.sh +++ b/tools/testing/selftests/net/tfo_passive.sh @@ -85,12 +85,15 @@ timeout -k 1s 30s ip netns exec nssv ./tfo \ -s \ -p ${SERVER_PORT} \ -o ${out_file}& +server_pid="$!" wait_local_port_listen nssv ${SERVER_PORT} tcp ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT} +client_exit_status="$?" -wait +wait "$server_pid" +server_exit_status="$?" res=$(cat $out_file) rm $out_file @@ -101,6 +104,14 @@ if [ "$res" = "0" ]; then exit 1 fi +if [ "$client_exit_status" -ne 0 ] || [ "$server_exit_status" -ne 0 ]; then + # Note: timeout(1) exits with 124 if it timed out + echo "client exited with ${client_exit_status}" + echo "server exited with ${server_exit_status}" + cleanup_ns + exit 1 +fi + echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL -- cgit v1.2.3 From 342e31254f02041e3b9d4ad573204d53b2f832c9 Mon Sep 17 00:00:00 2001 From: Yohei Kojima Date: Tue, 13 Jan 2026 23:11:55 +0900 Subject: selftests: net: improve error handling in passive TFO test Improve the error handling in passive TFO test to check the return value from sendto(), and to fail if read() or fprintf() failed. Signed-off-by: Yohei Kojima Link: https://patch.msgid.link/24707c8133f7095c0e5a94afa69e75c3a80bf6e7.1768312014.git.yk@y-koj.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tfo.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c index 8d82140f0f76..3b1ee2d3d417 100644 --- a/tools/testing/selftests/net/tfo.c +++ b/tools/testing/selftests/net/tfo.c @@ -82,8 +82,10 @@ static void run_server(void) error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)"); if (read(connfd, buf, 64) < 0) - perror("read()"); - fprintf(outfile, "%d\n", opt); + error(1, errno, "read()"); + + if (fprintf(outfile, "%d\n", opt) < 0) + error(1, errno, "fprintf()"); fclose(outfile); close(connfd); @@ -92,14 +94,17 @@ static void run_server(void) static void run_client(void) { - int fd; + int fd, ret; char *msg = "Hello, world!"; fd = socket(AF_INET6, SOCK_STREAM, 0); if (fd == -1) error(1, errno, "socket()"); - sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + ret = sendto(fd, msg, strlen(msg), MSG_FASTOPEN, + (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, errno, "sendto()"); close(fd); } -- cgit v1.2.3 From 1deecf7805f16cbcb3541cc57d8478b8b992a2ab Mon Sep 17 00:00:00 2001 From: LeeYongjun Date: Sun, 18 Jan 2026 15:55:10 +0900 Subject: selftests: ALSA: Remove unused variable in utimer-test The variable 'i' in wrong_timers_test() is declared but never used. This was detected by Cppcheck static analysis. tools/testing/selftests/alsa/utimer-test.c:144:9: style: Unused variable: i [unusedVariable] Remove it to clean up the code and silence the warning. Signed-off-by: LeeYongjun Link: https://patch.msgid.link/20260118065510.29644-1-jun85566@gmail.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/utimer-test.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c index c45cb226bd8f..d221972cd8fb 100644 --- a/tools/testing/selftests/alsa/utimer-test.c +++ b/tools/testing/selftests/alsa/utimer-test.c @@ -141,7 +141,6 @@ TEST_F(timer_f, utimer) { TEST(wrong_timers_test) { int timer_dev_fd; int utimer_fd; - size_t i; struct snd_timer_uinfo wrong_timer = { .resolution = 0, .id = UTIMER_DEFAULT_ID, -- cgit v1.2.3 From 59cac9d52b885cbeba45fa455417b03dfb03eaa7 Mon Sep 17 00:00:00 2001 From: UYeol Jo Date: Mon, 12 Jan 2026 06:01:26 +0900 Subject: selftests/x86: Clean up sysret_rip coding style Tidy up sysret_rip style (cast spacing, main(void), const placement). No functional change intended. Signed-off-by: UYeol Jo Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260111210126.74752-1-jouyeol8739@gmail.com --- tools/testing/selftests/x86/sysret_rip.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 5fb531e3ad7c..2e423a335e1c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -31,7 +31,7 @@ void test_syscall_ins(void); extern const char test_page[]; -static void const *current_test_page_addr = test_page; +static const void *current_test_page_addr = test_page; /* State used by our signal handlers. */ static gregset_t initial_regs; @@ -40,7 +40,7 @@ static volatile unsigned long rip; static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n", @@ -56,7 +56,7 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); @@ -69,8 +69,6 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ctx->uc_mcontext.gregs[REG_R11]); sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); - - return; } static void test_sigreturn_to(unsigned long ip) @@ -84,7 +82,7 @@ static jmp_buf jmpbuf; static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n", @@ -130,7 +128,7 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); } -int main() +int main(void) { /* * When the kernel returns from a slow-path syscall, it will -- cgit v1.2.3 From f93bc869825fdba3632ff6ddece4906a6673e679 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Jan 2026 15:30:35 +0100 Subject: selftests: add dm-verity keyring selftests Add selftests that verify the keyring behaves correctly. For simplicity this works with dm-verity as a module. Signed-off-by: Christian Brauner Signed-off-by: Mikulas Patocka --- tools/testing/selftests/dm-verity/Makefile | 5 + tools/testing/selftests/dm-verity/config | 10 + .../selftests/dm-verity/test-dm-verity-keyring.sh | 873 +++++++++++++++++++++ 3 files changed, 888 insertions(+) create mode 100644 tools/testing/selftests/dm-verity/Makefile create mode 100644 tools/testing/selftests/dm-verity/config create mode 100755 tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh (limited to 'tools') diff --git a/tools/testing/selftests/dm-verity/Makefile b/tools/testing/selftests/dm-verity/Makefile new file mode 100644 index 000000000000..b75ee08a54af --- /dev/null +++ b/tools/testing/selftests/dm-verity/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := test-dm-verity-keyring.sh + +include ../lib.mk diff --git a/tools/testing/selftests/dm-verity/config b/tools/testing/selftests/dm-verity/config new file mode 100644 index 000000000000..1cd3712fa0a4 --- /dev/null +++ b/tools/testing/selftests/dm-verity/config @@ -0,0 +1,10 @@ +CONFIG_BLK_DEV_DM=y +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_MODULE_UNLOAD=y +CONFIG_KEYS=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_SYSTEM_DATA_VERIFICATION=y diff --git a/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh new file mode 100755 index 000000000000..1f9601ef22f8 --- /dev/null +++ b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh @@ -0,0 +1,873 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test script for dm-verity keyring functionality +# +# This script has two modes depending on kernel configuration: +# +# 1. keyring_unsealed=1 AND require_signatures=1: +# - Upload a test key to the .dm-verity keyring +# - Seal the keyring +# - Create a dm-verity device with a signed root hash +# - Verify signature verification works +# +# 2. keyring_unsealed=0 (default) OR require_signatures=0: +# - Verify the keyring is already sealed (if unsealed=0) +# - Verify keys cannot be added to a sealed keyring +# - Verify the keyring is inactive (not used for verification) +# +# Requirements: +# - Root privileges +# - openssl +# - veritysetup (cryptsetup) +# - keyctl (keyutils) + +set -e + +WORK_DIR="" +DATA_DEV="" +HASH_DEV="" +DM_NAME="verity-test-$$" +CLEANUP_DONE=0 + +# Module parameters (detected at runtime) +KEYRING_UNSEALED="" +REQUIRE_SIGNATURES="" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $*" +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $*" >&2 +} + +log_skip() { + echo -e "${YELLOW}[SKIP]${NC} $*" +} + +cleanup() { + if [ "$CLEANUP_DONE" -eq 1 ]; then + return + fi + CLEANUP_DONE=1 + + log_info "Cleaning up..." + + # Remove dm-verity device if it exists + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi + + # Detach loop devices + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + fi + + # Remove work directory + if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then + rm -rf "$WORK_DIR" + fi +} + +trap cleanup EXIT + +die() { + log_error "$*" + exit 1 +} + +find_dm_verity_keyring() { + # The .dm-verity keyring is not linked to user-accessible keyrings, + # so we need to find it via /proc/keys + local serial_hex + serial_hex=$(awk '/\.dm-verity/ {print $1}' /proc/keys 2>/dev/null) + + if [ -z "$serial_hex" ]; then + return 1 + fi + + # Convert hex to decimal for keyctl + echo $((16#$serial_hex)) +} + +get_module_param() { + local param="$1" + local path="/sys/module/dm_verity/parameters/$param" + + if [ -f "$path" ]; then + cat "$path" + else + echo "" + fi +} + +check_requirements() { + log_info "Checking requirements..." + + # Check for root + if [ "$(id -u)" -ne 0 ]; then + die "This script must be run as root" + fi + + # Check for required tools + for cmd in openssl veritysetup keyctl losetup dmsetup dd awk; do + if ! command -v "$cmd" &>/dev/null; then + die "Required command not found: $cmd" + fi + done + + # Check for dm-verity module + if ! modprobe -n dm-verity &>/dev/null; then + die "dm-verity module not available" + fi + + # Verify OpenSSL can create signatures + # OpenSSL cms -sign with -binary -outform DER creates detached signatures by default + log_info "Using OpenSSL for PKCS#7 signatures" +} + +load_dm_verity_module() { + local keyring_unsealed="${1:-0}" + local require_signatures="${2:-0}" + + log_info "Loading dm-verity module with keyring_unsealed=$keyring_unsealed require_signatures=$require_signatures" + + # Unload if already loaded + if lsmod | grep -q '^dm_verity'; then + log_info "Unloading existing dm-verity module..." + modprobe -r dm-verity 2>/dev/null || \ + die "Failed to unload dm-verity module (may be in use)" + sleep 1 + fi + + # Load with specified parameters + modprobe dm-verity keyring_unsealed="$keyring_unsealed" require_signatures="$require_signatures" || \ + die "Failed to load dm-verity module" + + # Wait for keyring to be created (poll with timeout) + local keyring_id="" + local timeout=50 # 5 seconds (50 * 0.1s) + while [ $timeout -gt 0 ]; do + keyring_id=$(find_dm_verity_keyring) && break + sleep 0.1 + timeout=$((timeout - 1)) + done + + if [ -z "$keyring_id" ]; then + die "dm-verity keyring not found after module load (timeout)" + fi + + log_info "Found .dm-verity keyring: $keyring_id" + echo "$keyring_id" > "$WORK_DIR/keyring_id" + + # Read and display module parameters + KEYRING_UNSEALED=$(get_module_param "keyring_unsealed") + REQUIRE_SIGNATURES=$(get_module_param "require_signatures") + + log_info "Module parameters:" + log_info " keyring_unsealed=$KEYRING_UNSEALED" + log_info " require_signatures=$REQUIRE_SIGNATURES" +} + +unload_dm_verity_module() { + log_info "Unloading dm-verity module..." + + # Clean up any dm-verity devices first + local dm_dev + while read -r dm_dev _; do + [ -n "$dm_dev" ] || continue + log_info "Removing dm-verity device: $dm_dev" + dmsetup remove "$dm_dev" 2>/dev/null || true + done < <(dmsetup ls --target verity 2>/dev/null) + + if lsmod | grep -q '^dm_verity'; then + modprobe -r dm-verity 2>/dev/null || \ + log_warn "Failed to unload dm-verity module" + sleep 1 + fi +} + +generate_keys() { + log_info "Generating signing key pair..." + + # Generate private key (2048-bit for faster test execution) + openssl genrsa -out "$WORK_DIR/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # The kernel requires digitalSignature key usage for signature verification + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$WORK_DIR/openssl.cnf" << 'EOF' +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-key + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$WORK_DIR/private.pem" \ + -out "$WORK_DIR/cert.pem" -days 365 \ + -config "$WORK_DIR/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$WORK_DIR/cert.pem" -outform DER \ + -out "$WORK_DIR/cert.der" + + # Show certificate info for debugging + log_info "Certificate details:" + openssl x509 -in "$WORK_DIR/cert.pem" -noout -text 2>/dev/null | \ + grep -E "Subject:|Issuer:|Key Usage|Extended" | head -10 + + log_info "Keys generated successfully" +} + +seal_keyring() { + log_info "Sealing the .dm-verity keyring..." + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + keyctl restrict_keyring "$keyring_id" || \ + die "Failed to seal keyring" + + log_info "Keyring sealed successfully" +} + +create_test_device() { + log_info "Creating test device images..." + + # Create data image with random content (8MB is sufficient for testing) + dd if=/dev/urandom of="$WORK_DIR/data.img" bs=1M count=8 status=none + + # Create hash image (will be populated by veritysetup) + dd if=/dev/zero of="$WORK_DIR/hash.img" bs=1M count=1 status=none + + # Setup loop devices + DATA_DEV=$(losetup --find --show "$WORK_DIR/data.img") + HASH_DEV=$(losetup --find --show "$WORK_DIR/hash.img") + + log_info "Data device: $DATA_DEV" + log_info "Hash device: $HASH_DEV" +} + +create_verity_hash() { + log_info "Creating dm-verity hash tree..." + + local root_hash output + output=$(veritysetup format "$DATA_DEV" "$HASH_DEV" 2>&1) + root_hash=$(echo "$output" | grep "Root hash:" | awk '{print $3}') + + if [ -z "$root_hash" ]; then + log_error "veritysetup format output:" + echo "$output" | sed 's/^/ /' + die "Failed to get root hash from veritysetup format" + fi + + echo "$root_hash" > "$WORK_DIR/root_hash" + log_info "Root hash: $root_hash" +} + +create_detached_signature() { + local infile="$1" + local outfile="$2" + local cert="$3" + local key="$4" + + # Use openssl smime (not cms) for PKCS#7 signatures compatible with kernel + # Flags from working veritysetup example: + # -nocerts: don't include certificate in signature + # -noattr: no signed attributes + # -binary: binary input mode + if openssl smime -sign -nocerts -noattr -binary \ + -in "$infile" \ + -inkey "$key" \ + -signer "$cert" \ + -outform der \ + -out "$outfile" 2>/dev/null; then + return 0 + fi + + log_error "Failed to create signature" + return 1 +} + +activate_verity_device() { + local with_sig="$1" + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Clear dmesg and capture any kernel messages during activation + dmesg -C 2>/dev/null || true + + if [ "$with_sig" = "yes" ]; then + log_info "Activating dm-verity device with signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" \ + --root-hash-signature="$WORK_DIR/root_hash.p7s" 2>&1 + local ret=$? + else + log_info "Activating dm-verity device without signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" 2>&1 + local ret=$? + fi + + # Show relevant kernel messages + local kmsg + kmsg=$(dmesg 2>/dev/null | grep -i -E 'verity|pkcs|signature|asymmetric|key' | tail -10) + if [ -n "$kmsg" ]; then + log_info "Kernel messages:" + echo "$kmsg" | while read -r line; do echo " $line"; done + fi + + return $ret +} + +deactivate_verity_device() { + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi +} + +show_keyring_status() { + log_info "Keyring status:" + + local keyring_id + keyring_id=$(find_dm_verity_keyring) || true + + if [ -n "$keyring_id" ]; then + echo " Keyring ID: $keyring_id" + keyctl show "$keyring_id" 2>/dev/null || true + grep '\.dm-verity' /proc/keys 2>/dev/null || true + fi +} + +list_keyring_keys() { + log_info "Keys in .dm-verity keyring:" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id" 2>/dev/null) || \ + keyring_id=$(find_dm_verity_keyring) || true + + if [ -z "$keyring_id" ]; then + log_warn "Could not find keyring" + return + fi + + # List all keys in the keyring + local keys + keys=$(keyctl list "$keyring_id" 2>/dev/null) + if [ -z "$keys" ] || [ "$keys" = "keyring is empty" ]; then + echo " (empty)" + else + echo "$keys" | while read -r line; do + echo " $line" + done + + # Show detailed info for each key + log_info "Key details:" + keyctl list "$keyring_id" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+$' | while read -r key_id; do + echo " Key $key_id:" + keyctl describe "$key_id" 2>/dev/null | sed 's/^/ /' + done + fi +} + +generate_named_key() { + local name="$1" + local key_dir="$WORK_DIR/keys/$name" + + mkdir -p "$key_dir" + + # Log to stderr so it doesn't interfere with return value + echo "[INFO] Generating key pair: $name" >&2 + + # Generate private key + openssl genrsa -out "$key_dir/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$key_dir/openssl.cnf" << EOF +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-$name + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$key_dir/private.pem" \ + -out "$key_dir/cert.pem" -days 365 \ + -config "$key_dir/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$key_dir/cert.pem" -outform DER \ + -out "$key_dir/cert.der" + + # Return the key directory path (only this goes to stdout) + echo "$key_dir" +} + +upload_named_key() { + local name="$1" + local key_dir="$2" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Uploading key '$name' to keyring..." + + local key_id + if key_id=$(keyctl padd asymmetric "$name" "$keyring_id" \ + < "$key_dir/cert.der" 2>&1); then + log_info "Key '$name' uploaded with ID: $key_id" + echo "$key_id" > "$key_dir/key_id" + return 0 + else + log_error "Failed to upload key '$name': $key_id" + return 1 + fi +} + +# +# Test: Verify sealed keyring rejects key additions +# +test_sealed_keyring_rejects_keys() { + log_info "TEST: Verify sealed keyring rejects key additions" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + generate_keys + + # Try to add a key - should fail + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Key addition should have been rejected on sealed keyring" + return 1 + else + log_pass "Sealed keyring correctly rejected key addition" + return 0 + fi +} + +# +# Test: Multiple keys in keyring +# +test_multiple_keys() { + log_info "TEST: Multiple keys in keyring" + + local key1_dir key2_dir key3_dir + + # Generate three different keys + key1_dir=$(generate_named_key "vendor-a") + key2_dir=$(generate_named_key "vendor-b") + key3_dir=$(generate_named_key "vendor-c") + + # Upload all three keys + upload_named_key "vendor-a" "$key1_dir" || return 1 + upload_named_key "vendor-b" "$key2_dir" || return 1 + upload_named_key "vendor-c" "$key3_dir" || return 1 + + log_info "" + log_info "Keys in keyring before sealing:" + list_keyring_keys + show_keyring_status + + # Seal the keyring + log_info "" + seal_keyring + + # List keys after sealing + log_info "" + log_info "Keys in keyring after sealing:" + list_keyring_keys + show_keyring_status + + log_pass "Key upload and keyring sealing succeeded" + + # Create test device + log_info "" + create_test_device + create_verity_hash + + # Test 1: Sign with key1, should verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-a key" + if ! sign_root_hash_with_key "$key1_dir"; then + log_fail "Failed to sign with vendor-a key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-a key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-a key should succeed" + return 1 + fi + + # Test 2: Sign with key2, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-b key" + if ! sign_root_hash_with_key "$key2_dir"; then + log_fail "Failed to sign with vendor-b key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-b key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-b key should succeed" + return 1 + fi + + # Test 3: Sign with key3, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-c key" + if ! sign_root_hash_with_key "$key3_dir"; then + log_fail "Failed to sign with vendor-c key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-c key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-c key should succeed" + return 1 + fi + + # Test 4: Generate a key NOT in the keyring, should fail + log_info "" + log_info "Sub-test: Verify with unknown key (should fail)" + local unknown_key_dir + unknown_key_dir=$(generate_named_key "unknown-vendor") + if ! sign_root_hash_with_key "$unknown_key_dir"; then + log_fail "Failed to sign with unknown-vendor key" + return 1 + fi + if activate_verity_device "yes"; then + log_fail "Verification with unknown key should fail" + deactivate_verity_device + return 1 + else + log_pass "Verification with unknown key correctly rejected" + fi + + log_info "" + log_pass "Multiple keys test completed successfully" + return 0 +} + +sign_root_hash_with_key() { + local key_dir="$1" + + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Create the data to sign (hex string, not binary) + echo -n "$root_hash" > "$WORK_DIR/root_hash.txt" + + # Debug: show exactly what we're signing + log_info "Root hash (hex): $root_hash" + log_info "Root hash hex string size: $(wc -c < "$WORK_DIR/root_hash.txt") bytes" + + # Create detached PKCS#7 signature + if ! create_detached_signature "$WORK_DIR/root_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem"; then + log_error "Failed to sign root hash with key from $key_dir" + return 1 + fi + + # Debug: show signing certificate info + log_info "Signed with certificate:" + openssl x509 -in "$key_dir/cert.pem" -noout -subject 2>/dev/null | sed 's/^/ /' + + # Debug: verify signature locally + # -nointern: cert not in signature, use -certfile + # -noverify: skip certificate chain validation (self-signed) + if openssl smime -verify -binary -inform der -nointern -noverify \ + -in "$WORK_DIR/root_hash.p7s" \ + -content "$WORK_DIR/root_hash.txt" \ + -certfile "$key_dir/cert.pem" \ + -out /dev/null 2>/dev/null; then + log_info "Local signature verification: PASSED" + else + log_warn "Local signature verification: FAILED" + fi + return 0 +} + +# +# Test: Verify corrupted signatures are rejected +# +test_corrupted_signature() { + log_info "TEST: Verify corrupted signatures are rejected" + + # This test requires a valid setup from test_multiple_keys or similar + # It modifies the signature file and verifies rejection + + if [ ! -f "$WORK_DIR/root_hash.p7s" ]; then + log_warn "No signature file found, skipping corrupted signature test" + return 0 + fi + + # Save original signature + cp "$WORK_DIR/root_hash.p7s" "$WORK_DIR/root_hash.p7s.orig" + + # Test 1: Truncated signature + log_info "Sub-test: Truncated signature (should fail)" + head -c 100 "$WORK_DIR/root_hash.p7s.orig" > "$WORK_DIR/root_hash.p7s" + if activate_verity_device "yes"; then + log_fail "Truncated signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Truncated signature correctly rejected" + fi + + # Test 2: Corrupted signature (flip some bytes) + log_info "Sub-test: Corrupted signature bytes (should fail)" + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + # Corrupt bytes in the middle of the signature + local sig_size + sig_size=$(wc -c < "$WORK_DIR/root_hash.p7s") + local corrupt_offset=$((sig_size / 2)) + printf '\xff\xff\xff\xff' | dd of="$WORK_DIR/root_hash.p7s" bs=1 seek=$corrupt_offset conv=notrunc 2>/dev/null + if activate_verity_device "yes"; then + log_fail "Corrupted signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Corrupted signature correctly rejected" + fi + + # Test 3: Signature over wrong data (sign different content) + log_info "Sub-test: Signature over wrong data (should fail)" + # Create a different root hash (all zeros as hex string) + printf '%064d' 0 > "$WORK_DIR/wrong_hash.txt" + # Get the first key directory that was used + local key_dir="$WORK_DIR/keys/vendor-a" + if [ -d "$key_dir" ]; then + create_detached_signature "$WORK_DIR/wrong_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem" + if activate_verity_device "yes"; then + log_fail "Signature over wrong data should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Signature over wrong data correctly rejected" + fi + else + log_warn "Key directory not found, skipping wrong data test" + fi + + # Restore original signature + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + + log_pass "Corrupted signature test completed successfully" + return 0 +} + +# +# Test: Verify keyring is sealed when keyring_unsealed=0 +# +test_keyring_sealed_by_default() { + log_info "TEST: Verify keyring is sealed by default (keyring_unsealed=0)" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Current keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + generate_keys + + # Try to add a key - should fail if keyring is sealed + log_info "Attempting to add key to sealed keyring..." + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Keyring should be sealed when keyring_unsealed=0" + list_keyring_keys + return 1 + else + log_pass "Keyring is correctly sealed when keyring_unsealed=0" + log_info "Keyring state after failed add attempt:" + list_keyring_keys + return 0 + fi +} + +# +# Test: Verify dm-verity keyring is inactive when sealed empty +# +test_keyring_inactive_when_empty() { + log_info "TEST: Verify dm-verity keyring is inactive when sealed empty" + + # When keyring_unsealed=0, the keyring is sealed immediately while empty + # This means it should NOT be used for verification (nr_leaves_on_tree=0) + + log_info "Keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + create_test_device + create_verity_hash + + # Without any keys in the dm-verity keyring, and with it sealed, + # verification should fall through to the secondary/platform keyrings + # and likely succeed (if require_signatures=0) or fail (if =1) + + log_info "Sub-test: Device activation with sealed empty keyring" + if [ "$REQUIRE_SIGNATURES" = "Y" ] || [ "$REQUIRE_SIGNATURES" = "1" ]; then + if activate_verity_device "no"; then + log_fail "Device should NOT activate without signature when require_signatures=1" + deactivate_verity_device + return 1 + else + log_pass "Device correctly rejected (require_signatures=1, no valid signature)" + fi + else + if activate_verity_device "no"; then + log_pass "Device activated (require_signatures=0, empty dm-verity keyring is inactive)" + deactivate_verity_device + else + log_fail "Device should activate when require_signatures=0" + return 1 + fi + fi + + return 0 +} + +main() { + local rc=0 + + log_info "=== dm-verity keyring test ===" + log_info "" + + # Create work directory + WORK_DIR=$(mktemp -d -t dm-verity-test.XXXXXX) + log_info "Work directory: $WORK_DIR" + + check_requirements + + # + # Test 1: UNSEALED keyring mode (keyring_unsealed=1) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: UNSEALED KEYRING ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 1 1 # keyring_unsealed=1, require_signatures=1 + show_keyring_status + + log_info "" + if ! test_multiple_keys; then + rc=1 + fi + + # After sealing, verify it rejects new keys + log_info "" + if ! test_sealed_keyring_rejects_keys; then + rc=1 + fi + + # Test corrupted signatures are rejected + log_info "" + if ! test_corrupted_signature; then + rc=1 + fi + + # Clean up devices before reloading module + deactivate_verity_device + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + DATA_DEV="" + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + HASH_DEV="" + fi + + # + # Test 2: SEALED keyring mode (keyring_unsealed=0, default) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: SEALED KEYRING (default) ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 0 0 # keyring_unsealed=0, require_signatures=0 + show_keyring_status + + log_info "" + if ! test_keyring_sealed_by_default; then + rc=1 + fi + + log_info "" + if ! test_keyring_inactive_when_empty; then + rc=1 + fi + + # + # Summary + # + log_info "" + log_info "========================================" + if [ $rc -eq 0 ]; then + log_info "=== All tests PASSED ===" + else + log_error "=== Some tests FAILED ===" + fi + log_info "========================================" + + return $rc +} + +main "$@" -- cgit v1.2.3 From 03b7c2d763c907f508edf8c317c0e920ce072a33 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:16 -0800 Subject: vfio: selftests: Centralize IOMMU mode name definitions Replace scattered string literals with MODE_* macros in iommu.h. This provides a single source of truth for IOMMU mode name strings. Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-1-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/libvfio/iommu.h | 6 ++++++ tools/testing/selftests/vfio/lib/iommu.c | 12 ++++++------ tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h index 5c9b9dc6d993..e9a3386a4719 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -61,6 +61,12 @@ iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); +#define MODE_VFIO_TYPE1_IOMMU "vfio_type1_iommu" +#define MODE_VFIO_TYPE1V2_IOMMU "vfio_type1v2_iommu" +#define MODE_IOMMUFD_COMPAT_TYPE1 "iommufd_compat_type1" +#define MODE_IOMMUFD_COMPAT_TYPE1V2 "iommufd_compat_type1v2" +#define MODE_IOMMUFD "iommufd" + /* * Generator for VFIO selftests fixture variants that replicate across all * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c index 58b7fb7430d4..035dac069d60 100644 --- a/tools/testing/selftests/vfio/lib/iommu.c +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -20,32 +20,32 @@ #include "../../../kselftest.h" #include -const char *default_iommu_mode = "iommufd"; +const char *default_iommu_mode = MODE_IOMMUFD; /* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ static const struct iommu_mode iommu_modes[] = { { - .name = "vfio_type1_iommu", + .name = MODE_VFIO_TYPE1_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "vfio_type1v2_iommu", + .name = MODE_VFIO_TYPE1V2_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd_compat_type1", + .name = MODE_IOMMUFD_COMPAT_TYPE1, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "iommufd_compat_type1v2", + .name = MODE_IOMMUFD_COMPAT_TYPE1V2, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd", + .name = MODE_IOMMUFD, }, }; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 3bf984b337ac..3d2f44f9c62f 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -165,7 +165,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) * IOMMUFD compatibility-mode does not support huge mappings when * using VFIO_TYPE1_IOMMU. */ - if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) + if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1)) mapping_size = SZ_4K; ASSERT_EQ(0, rc); -- cgit v1.2.3 From 557dbdf6c4e9c2dc3d4a4476c67ef14dca32378d Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:17 -0800 Subject: vfio: selftests: Align BAR mmaps for efficient IOMMU mapping Update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings. The manual mmap alignment can be removed once mmap(!MAP_FIXED) on vfio device fds improves to automatically return well-aligned addresses. Also add MADV_HUGEPAGE, which encourages the kernel to use huge pages (e.g. when /sys/kernel/mm/transparent_hugepage/enabled is set to "madvise"). Drop MAP_FILE from mmap(). It is an ignored compatibility flag. Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-2-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/libvfio.h | 9 ++++++++ tools/testing/selftests/vfio/lib/libvfio.c | 25 ++++++++++++++++++++++ tools/testing/selftests/vfio/lib/vfio_pci_device.c | 24 ++++++++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 279ddcd70194..1b6da54cc2cb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -23,4 +23,13 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); +/* + * Reserve virtual address space of size at an address satisfying + * (vaddr % align) == offset. + * + * Returns the reserved vaddr. The caller is responsible for unmapping + * the returned region. + */ +void *mmap_reserve(size_t size, size_t align, size_t offset); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c index a23a3cc5be69..3a3d1ed635c1 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.c +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -2,6 +2,9 @@ #include #include +#include + +#include #include "../../../kselftest.h" #include @@ -76,3 +79,25 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]) return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; } + +void *mmap_reserve(size_t size, size_t align, size_t offset) +{ + void *map_base, *map_align; + size_t delta; + + VFIO_ASSERT_GT(align, offset); + delta = align - offset; + + map_base = mmap(NULL, size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + VFIO_ASSERT_NE(map_base, MAP_FAILED); + + map_align = (void *)(ALIGN((uintptr_t)map_base + delta, align) - delta); + + if (map_align > map_base) + VFIO_ASSERT_EQ(munmap(map_base, map_align - map_base), 0); + + VFIO_ASSERT_EQ(munmap(map_align + size, map_base + align - map_align), 0); + + return map_align; +} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index fac4c0ecadef..4e5871f1ebc3 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -11,10 +11,14 @@ #include #include +#include #include +#include #include +#include #include #include +#include #include #include @@ -123,20 +127,38 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index]; + size_t align, size; int prot = 0; + void *vaddr; VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP); + VFIO_ASSERT_TRUE(is_power_of_2(bar->info.size)); if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE; - bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED, + size = bar->info.size; + + /* + * Align BAR mmaps to improve page fault granularity during potential + * subsequent IOMMU mapping of these BAR vaddr. 1G for x86 is the + * largest hugepage size across any architecture, so no benefit from + * larger alignment. BARs smaller than 1G will be aligned by their + * power-of-two size, guaranteeing sufficient alignment for smaller + * hugepages, if present. + */ + align = min_t(size_t, size, SZ_1G); + + vaddr = mmap_reserve(size, align, 0); + bar->vaddr = mmap(vaddr, size, prot, MAP_SHARED | MAP_FIXED, device->fd, bar->info.offset); VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED); + + madvise(bar->vaddr, size, MADV_HUGEPAGE); } static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) -- cgit v1.2.3 From 080723f4d4c3c6fb0720aae614deb1f30ee9ef2e Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:18 -0800 Subject: vfio: selftests: Add vfio_dma_mapping_mmio_test Test IOMMU mapping the BAR mmaps created during vfio_pci_device_setup(). All IOMMU modes are tested: vfio_type1 variants are expected to succeed, while non-type1 modes are expected to fail. iommufd compat mode can be updated to expect success once kernel support lands. Native iommufd will not support mapping vaddrs backed by MMIO (it will support dma-buf based MMIO mapping instead). Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-3-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_dma_mapping_mmio_test.c | 143 +++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c (limited to 'tools') diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..ead27892ab65 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,6 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..957a89ce7b3a --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +static struct vfio_pci_bar *largest_mapped_bar(struct vfio_pci_device *device) +{ + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + struct vfio_pci_bar *largest = NULL; + u64 bar_size = 0; + + for (int i = 0; i < PCI_STD_NUM_BARS; i++) { + struct vfio_pci_bar *bar = &device->bars[i]; + + if (!bar->vaddr) + continue; + + /* + * iommu_map() maps with READ|WRITE, so require the same + * abilities for the underlying VFIO region. + */ + if ((bar->info.flags & flags) != flags) + continue; + + if (bar->info.size > bar_size) { + bar_size = bar->info.size; + largest = bar; + } + } + + return largest; +} + +FIXTURE(vfio_dma_mapping_mmio_test) { + struct iommu *iommu; + struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; + struct vfio_pci_bar *bar; +}; + +FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + +FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); + self->bar = largest_mapped_bar(self->device); + + if (!self->bar) + SKIP(return, "No mappable BAR found on device %s", device_bdf); +} + +FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{ + iova_allocator_cleanup(self->iova_allocator); + vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); +} + +static void do_mmio_map_test(struct iommu *iommu, + struct iova_allocator *iova_allocator, + void *vaddr, size_t size) +{ + struct dma_region region = { + .vaddr = vaddr, + .size = size, + .iova = iova_allocator_alloc(iova_allocator, size), + }; + + /* + * NOTE: Check for iommufd compat success once it lands. Native iommufd + * will never support this. + */ + if (!strcmp(iommu->mode->name, MODE_VFIO_TYPE1V2_IOMMU) || + !strcmp(iommu->mode->name, MODE_VFIO_TYPE1_IOMMU)) { + iommu_map(iommu, ®ion); + iommu_unmap(iommu, ®ion); + } else { + VFIO_ASSERT_NE(__iommu_map(iommu, ®ion), 0); + VFIO_ASSERT_NE(__iommu_unmap(iommu, ®ion, NULL), 0); + } +} + +TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{ + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, self->bar->info.size); +} + +TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{ + if (self->bar->info.size < 2 * getpagesize()) + SKIP(return, "BAR too small (size=0x%llx)", self->bar->info.size); + + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, getpagesize()); +} + +/* Test IOMMU mapping of BAR mmap with intentionally poor vaddr alignment. */ +TEST_F(vfio_dma_mapping_mmio_test, map_bar_misaligned) +{ + /* Limit size to bound test time for large BARs */ + size_t size = min_t(size_t, self->bar->info.size, SZ_1G); + void *vaddr; + + vaddr = mmap_reserve(size, SZ_1G, getpagesize()); + vaddr = mmap(vaddr, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, + self->device->fd, self->bar->info.offset); + VFIO_ASSERT_NE(vaddr, MAP_FAILED); + + do_mmio_map_test(self->iommu, self->iova_allocator, vaddr, size); + + VFIO_ASSERT_EQ(munmap(vaddr, size), 0); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +} -- cgit v1.2.3 From 1c588bca3bd5b39c93a28a5986bf82ebfb05eec2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 14 Jan 2026 21:12:52 +0000 Subject: vfio: selftests: Drop IOMMU mapping size assertions for VFIO_TYPE1_IOMMU Drop the assertions about IOMMU mappings sizes for VFIO_TYPE1_IOMMU modes (both the VFIO mode and the iommufd compatibility mode). These assertions fail when CONFIG_IOMMUFD_VFIO_CONTAINER is enabled, since iommufd compatibility mode provides different huge page behavior than VFIO for VFIO_TYPE1_IOMMU. VFIO_TYPE1_IOMMU is an old enough interface that it's not worth changing the behavior of VFIO and iommufd to match nor care about the IOMMU mapping sizes. Cc: Jason Gunthorpe Link: https://lore.kernel.org/kvm/20260109143830.176dc279@shazbot.org/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20260114211252.2581145-1-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 3d2f44f9c62f..abb170bdcef7 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -161,12 +161,8 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) if (rc == -EOPNOTSUPP) goto unmap; - /* - * IOMMUFD compatibility-mode does not support huge mappings when - * using VFIO_TYPE1_IOMMU. - */ - if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1)) - mapping_size = SZ_4K; + if (self->iommu->mode->iommu_type == VFIO_TYPE1_IOMMU) + goto unmap; ASSERT_EQ(0, rc); printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova); -- cgit v1.2.3 From 8becfe16e4a12218c703a98f5bfc15b6f0fbd99c Mon Sep 17 00:00:00 2001 From: Dmitry Skorodumov Date: Mon, 12 Jan 2026 17:24:07 +0300 Subject: selftests: net: simple selftest for ipvtap This is a simple ipvtap test to test handling IP-address add/remove on ipvlan interface. It creates a veth-interface and then creates several network-namespace with ipvlan0 interface in it linked to veth. Then it starts to add/remove addresses on ipvlan0 interfaces in several threads. At finish, it checks that there is no duplicated addresses. Signed-off-by: Dmitry Skorodumov Link: https://patch.msgid.link/20260112142417.4039566-3-skorodumov.dmitry@huawei.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 2 + tools/testing/selftests/net/ipvtap_test.sh | 168 +++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100755 tools/testing/selftests/net/ipvtap_test.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b66ba04f19d9..45c4ea381bc3 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -48,6 +48,7 @@ TEST_PROGS := \ ipv6_flowlabel.sh \ ipv6_force_forwarding.sh \ ipv6_route_update_soft_lockup.sh \ + ipvtap_test.sh \ l2_tos_ttl_inherit.sh \ l2tp.sh \ link_netns.py \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 1e1f253118f5..b84362b9b508 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -48,6 +48,7 @@ CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPV6_SIT=y CONFIG_IPV6_VTI=y CONFIG_IPVLAN=m +CONFIG_IPVTAP=m CONFIG_KALLSYMS=y CONFIG_L2TP=m CONFIG_L2TP_ETH=m @@ -116,6 +117,7 @@ CONFIG_PROC_SYSCTL=y CONFIG_PSAMPLE=m CONFIG_RPS=y CONFIG_SYSFS=y +CONFIG_TAP=m CONFIG_TCP_MD5SIG=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_TEST_BPF=m diff --git a/tools/testing/selftests/net/ipvtap_test.sh b/tools/testing/selftests/net/ipvtap_test.sh new file mode 100755 index 000000000000..354ca7ce8584 --- /dev/null +++ b/tools/testing/selftests/net/ipvtap_test.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Simple tests for ipvtap + + +# +# The testing environment looks this way: +# +# |------HNS-------| |------PHY-------| +# | veth<----------------->veth | +# |------|--|------| |----------------| +# | | +# | | |-----TST0-------| +# | |------------|----ipvlan | +# | |----------------| +# | +# | |-----TST1-------| +# |---------------|----ipvlan | +# |----------------| +# + +ALL_TESTS=" + test_ip_set +" + +source lib.sh + +DEBUG=0 + +VETH_HOST=vethtst.h +VETH_PHY=vethtst.p + +NS_COUNT=32 +IP_ITERATIONS=1024 +IPSET_TIMEOUT="60s" + +ns_run() { + ns=$1 + shift + if [[ "$ns" == "global" ]]; then + "$@" >/dev/null + else + ip netns exec "$ns" "$@" >/dev/null + fi +} + +test_ip_setup_env() { + setup_ns NS_PHY + setup_ns HST_NS + + # setup simulated other-host (phy) and host itself + ns_run "$HST_NS" ip link add $VETH_HOST type veth peer name $VETH_PHY \ + netns "$NS_PHY" >/dev/null + ns_run "$HST_NS" ip link set $VETH_HOST up + ns_run "$NS_PHY" ip link set $VETH_PHY up + + for ((i=0; i/dev/null + ip a a "fc00::$v/64" dev ipvlan0 2>/dev/null + v=$(rnd) + ip a d "172.25.0.$v/24" dev ipvlan0 2>/dev/null + ip a d "fc00::$v/64" dev ipvlan0 2>/dev/null + done +} + +test_ip_set() { + RET=0 + + trap test_ip_cleanup_env EXIT + + test_ip_setup_env + + declare -A ns_pids + for ((i=0; i Date: Fri, 16 Jan 2026 10:48:55 +0100 Subject: selftests: net: csum: Fix printk format in recv_get_packet_csum_status() Following warning is encountered when building selftests on powerpc/32. CC csum csum.c: In function 'recv_get_packet_csum_status': csum.c:710:50: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'size_t' {aka 'unsigned int'} [-Wformat=] 710 | error(1, 0, "cmsg: len=%lu expected=%lu", | ~~^ | | | long unsigned int | %u 711 | cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); | ~~~~~~~~~~~~ | | | size_t {aka unsigned int} csum.c:710:63: warning: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'unsigned int' [-Wformat=] 710 | error(1, 0, "cmsg: len=%lu expected=%lu", | ~~^ | | | long unsigned int | %u cm->cmsg_len has type __kernel_size_t and CMSG() macro has the type returned by sizeof() which is size_t. size_t is 'unsigned int' on some platforms and 'unsigned long' on other ones so use %zu instead of %lu. The code in question was introduced by commit 91a7de85600d ("selftests/net: add csum offload test"). Signed-off-by: Christophe Leroy (CS GROUP) Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/8b69b40826553c1dd500d9d25e45883744f3f348.1768556791.git.chleroy@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/csum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c index 27437590eeb5..e28884ce3ab3 100644 --- a/tools/testing/selftests/net/lib/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -707,7 +707,7 @@ static uint32_t recv_get_packet_csum_status(struct msghdr *msg) cm->cmsg_level, cm->cmsg_type); if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) - error(1, 0, "cmsg: len=%lu expected=%lu", + error(1, 0, "cmsg: len=%zu expected=%zu", cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); aux = (void *)CMSG_DATA(cm); -- cgit v1.2.3 From 2460f31e6e444a52a4e718e4fe64cff29ffaab05 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 14 Jan 2026 11:02:43 -0500 Subject: selftests/tc-testing: Try to add teql as a child qdisc Add a selftest that attempts to add a teql qdisc as a qfq child. Since teql _must_ be added as a root qdisc, the kernel should reject this. Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260114160243.913069-4-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/qdiscs/teql.json | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json index e5cc31f265f8..0179c57104ad 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json @@ -81,5 +81,30 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP link del dev $DUMMY" ] + }, + { + "id": "124e", + "name": "Try to add teql as a child qdisc", + "category": [ + "qdisc", + "ets", + "tbf" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 15 maxpkt 16384" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1:1 handle 2:1 teql0", + "expExitCode": "2", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1", + "matchJSON": [], + "teardown": [ + "$TC qdisc del dev $DUMMY root handle 1:" + ] } ] -- cgit v1.2.3 From a5546e18f77c0cb15d434bf5b92647687fe483e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:48 +0100 Subject: net: Add queue-create operation Add a ynl netdev family operation called queue-create that creates a new queue on a netdevice: name: queue-create attribute-set: queue flags: [admin-perm] do: request: attributes: - ifindex - type - lease reply: &queue-create-op attributes: - id This is a generic operation such that it can be extended for various use cases in future. Right now it is mandatory to specify ifindex, the queue type which is enforced to rx and a lease. The newly created queue id is returned to the caller. A queue from a virtual device can have a lease which refers to another queue from a physical device. This is useful for memory providers and AF_XDP operations which take an ifindex and queue id to allow applications to bind against virtual devices in containers. The lease couples both queues together and allows to proxy the operations from a virtual device in a container to the physical device. In future, the nested lease attribute can be lifted and made optional for other use-cases such as dynamic queue creation for physical netdevs. The lack of lease and the specification of the physical device as an ifindex will imply that we need a real queue to be allocated. Similarly, the queue type enforcement to rx can then be lifted as well to support tx. An early implementation had only driver-specific integration [0], but in order for other virtual devices to reuse, it makes sense to have this as a generic API in core net. For leasing queues, the virtual netdev must have real_num_rx_queue less than num_rx_queues at the time of calling queue-create. The queue-type must be rx as only rx queues are supported for leasing for now. We also enforce that the queue-create ifindex must point to a virtual device, and that the nested lease attribute's ifindex must point to a physical device. The nested lease attribute set contains a netns-id attribute which is currently only intended for dumping as part of the queue-get operation. Also, it is modeled as an s32 type similarly as done elsewhere in the stack. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0] Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-2-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 44 +++++++++++++++++++++++++++++++++ include/uapi/linux/netdev.h | 11 +++++++++ net/core/netdev-genl-gen.c | 20 +++++++++++++++ net/core/netdev-genl-gen.h | 2 ++ net/core/netdev-genl.c | 5 ++++ tools/include/uapi/linux/netdev.h | 11 +++++++++ 6 files changed, 93 insertions(+) (limited to 'tools') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 596c306ce52b..b86db8656eac 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,6 +339,15 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info + - + name: lease + doc: | + A queue from a virtual device can have a lease which refers to + another queue from a physical device. This is useful for memory + providers and AF_XDP operations which take an ifindex and queue id + to allow applications to bind against virtual devices in containers. + type: nest + nested-attributes: lease - name: qstats doc: | @@ -537,6 +546,24 @@ attribute-sets: name: id - name: type + - + name: lease + attributes: + - + name: ifindex + doc: The netdev ifindex to lease the queue from. + type: u32 + checks: + min: 1 + - + name: queue + doc: The netdev queue to lease from. + type: nest + nested-attributes: queue-id + - + name: netns-id + doc: The network namespace id of the netdev. + type: s32 - name: dmabuf attributes: @@ -686,6 +713,7 @@ operations: - dmabuf - io-uring - xsk + - lease dump: request: attributes: @@ -797,6 +825,22 @@ operations: reply: attributes: - id + - + name: queue-create + doc: | + Create a new queue for the given netdevice. Whether this operation + is supported depends on the device and the driver. + attribute-set: queue + flags: [admin-perm] + do: + request: + attributes: + - ifindex + - type + - lease + reply: &queue-create-op + attributes: + - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ba673e81716f..52ba99c019e7 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ +const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { + [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), + [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, +}; + const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; +/* NETDEV_CMD_QUEUE_CREATE - do */ +static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_QUEUE_CREATE, + .doit = netdev_nl_queue_create_doit, + .policy = netdev_queue_create_nl_policy, + .maxattr = NETDEV_A_QUEUE_LEASE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cffc08517a41..d71b435d72c1 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,6 +14,7 @@ #include /* Common nested types */ +extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 470fabbeacd9..aae75431858d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -1120,6 +1120,11 @@ err_genlmsg_free: return err; } +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 61d99ce3dfc2f1ba5bf713093bb3a5faf5ebc2dc Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:00 +0100 Subject: selftests/net: Add bpf skb forwarding program Add nk_forward.bpf.c, a BPF program that forwards skbs matching some IPv6 prefix received on eth0 ifindex to a specified netkit ifindex. This will be needed by netkit container tests. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-14-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/hw/nk_forward.bpf.c | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c new file mode 100644 index 000000000000..86ebfc1445b6 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#define TC_ACT_OK 0 +#define ETH_P_IPV6 0x86DD + +#define ctx_ptr(field) ((void *)(long)(field)) + +#define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ + a.s6_addr32[1] == b.s6_addr32[1]) + +volatile __u32 netkit_ifindex; +volatile __u8 ipv6_prefix[16]; + +SEC("tc/ingress") +int tc_redirect_peer(struct __sk_buff *skb) +{ + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + struct in6_addr *peer_addr; + struct ipv6hdr *ip6h; + struct ethhdr *eth; + + peer_addr = (struct in6_addr *)ipv6_prefix; + + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + eth = data; + if ((void *)(eth + 1) > data_end) + return TC_ACT_OK; + + ip6h = data + sizeof(struct ethhdr); + if ((void *)(ip6h + 1) > data_end) + return TC_ACT_OK; + + if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) + return TC_ACT_OK; + + return bpf_redirect_peer(netkit_ifindex, 0); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6be87fbb27763c2999e1c69bbec1f3a63cf05422 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:01 +0100 Subject: selftests/net: Add env for container based tests Add an env NetDrvContEnv for container based selftests. This automates the setup of a netns, netkit pair with one inside the netns, and a BPF program that forwards skbs from the NETIF host inside the container. Currently only netkit is used, but other virtual netdevs e.g. veth can be used too. Expect netkit container datapath selftests to have a publicly routable IP prefix to assign to netkit in a container, such that packets will land on eth0. The BPF skb forward program will then forward such packets from the host netns to the container netns. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-15-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/README.rst | 7 ++ .../selftests/drivers/net/hw/lib/py/__init__.py | 7 +- .../selftests/drivers/net/lib/py/__init__.py | 7 +- tools/testing/selftests/drivers/net/lib/py/env.py | 112 +++++++++++++++++++++ 4 files changed, 127 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index eb838ae94844..b94e81c2e030 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -62,6 +62,13 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 Local and remote endpoint IP addresses. +LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local IP prefix/subnet which can be used to allocate extra IP addresses (for +network name spaces behind macvlan, veth, netkit devices). DUT must be +reachable using these addresses from the endpoint. + REMOTE_TYPE ~~~~~~~~~~~ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index d5d247eca6b7..022008249313 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -3,6 +3,7 @@ """ Driver test environment (hardware-only tests). NetDrvEnv and NetDrvEpEnv are the main environment classes. +NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -29,7 +30,7 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", @@ -44,8 +45,8 @@ try: "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", - "Iperf3Runner"] + "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", + "Remote", "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 8b75faa9af6d..be3a8a936882 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -3,6 +3,7 @@ """ Driver test environment. NetDrvEnv and NetDrvEpEnv are the main environment classes. +NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -43,12 +44,12 @@ try: "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] - from .env import NetDrvEnv, NetDrvEpEnv + from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", - "Iperf3Runner"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", + "Remote", "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 41cc248ac848..5b12c4c59e09 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 +import ipaddress import os +import re import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx @@ -8,6 +10,7 @@ from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev from .remote import Remote +from . import bpftool class NetDrvEnvBase: @@ -289,3 +292,112 @@ class NetDrvEpEnv(NetDrvEnvBase): data.get('stats-block-usecs', 0) / 1000 / 1000 time.sleep(self._stats_settle_time) + + +class NetDrvContEnv(NetDrvEpEnv): + """ + Class for an environment with a netkit pair setup for forwarding traffic + between the physical interface and a network namespace. + """ + + def __init__(self, src_path, nk_rxqueues=1, **kwargs): + super().__init__(src_path, **kwargs) + + self.require_ipver("6") + local_prefix = self.env.get("LOCAL_PREFIX_V6") + if not local_prefix: + raise KsftSkipEx("LOCAL_PREFIX_V6 required") + + local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") + self.ipv6_prefix = f"{local_prefix}::" + self.nk_host_ipv6 = f"{local_prefix}::2:1" + self.nk_guest_ipv6 = f"{local_prefix}::2:2" + + self.netns = None + self._nk_host_ifname = None + self._nk_guest_ifname = None + self._tc_attached = False + self._bpf_prog_pref = None + self._bpf_prog_id = None + + ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") + + all_links = ip("-d link show", json=True) + netkit_links = [link for link in all_links + if link.get('linkinfo', {}).get('info_kind') == 'netkit' + and 'UP' not in link.get('flags', [])] + + if len(netkit_links) != 2: + raise KsftSkipEx("Failed to create netkit pair") + + netkit_links.sort(key=lambda x: x['ifindex']) + self._nk_host_ifname = netkit_links[1]['ifname'] + self._nk_guest_ifname = netkit_links[0]['ifname'] + self.nk_host_ifindex = netkit_links[1]['ifindex'] + self.nk_guest_ifindex = netkit_links[0]['ifindex'] + + self._setup_ns() + self._attach_bpf() + + def __del__(self): + if self._tc_attached: + cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") + self._tc_attached = False + + if self._nk_host_ifname: + cmd(f"ip link del dev {self._nk_host_ifname}") + self._nk_host_ifname = None + self._nk_guest_ifname = None + + if self.netns: + del self.netns + self.netns = None + + super().__del__() + + def _setup_ns(self): + self.netns = NetNS() + ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") + ip(f"link set dev {self._nk_host_ifname} up") + ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") + ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") + + ip("link set lo up", ns=self.netns) + ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) + ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) + ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) + ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) + + def _attach_bpf(self): + bpf_obj = self.test_dir / "nk_forward.bpf.o" + if not bpf_obj.exists(): + raise KsftSkipEx("BPF prog not found") + + cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action") + self._tc_attached = True + + tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout + match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info) + if not match: + raise Exception("Failed to get BPF prog ID") + self._bpf_prog_pref = int(match.group(1)) + self._bpf_prog_id = int(match.group(2)) + + prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) + map_ids = prog_info.get("map_ids", []) + + bss_map_id = None + for map_id in map_ids: + map_info = bpftool(f"map show id {map_id}", json=True) + if map_info.get("name").endswith("bss"): + bss_map_id = map_id + + if bss_map_id is None: + raise Exception("Failed to find .bss map") + + ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) + ipv6_bytes = ipv6_addr.packed + ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little') + value = ipv6_bytes + ifindex_bytes + value_hex = ' '.join(f'{b:02x}' for b in value) + bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") -- cgit v1.2.3 From ab771c938d9a57d510bb70c565c9388b10494090 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:02 +0100 Subject: selftests/net: Make NetDrvContEnv support queue leasing Add a new parameter `lease` to NetDrvContEnv that sets up queue leasing in the env. The NETIF also has some ethtool parameters changed to support memory provider tests. This is needed in NetDrvContEnv rather than individual test cases since the cleanup to restore NETIF can't be done, until the netns in the env is gone. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-16-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/lib/py/env.py | 47 ++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 5b12c4c59e09..7066d78395c6 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -9,6 +9,7 @@ from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev +from lib.py import NetdevFamily, EthtoolFamily from .remote import Remote from . import bpftool @@ -300,7 +301,7 @@ class NetDrvContEnv(NetDrvEpEnv): between the physical interface and a network namespace. """ - def __init__(self, src_path, nk_rxqueues=1, **kwargs): + def __init__(self, src_path, lease=False, **kwargs): super().__init__(src_path, **kwargs) self.require_ipver("6") @@ -308,6 +309,9 @@ class NetDrvContEnv(NetDrvEpEnv): if not local_prefix: raise KsftSkipEx("LOCAL_PREFIX_V6 required") + self.netdevnl = NetdevFamily() + self.ethnl = EthtoolFamily() + local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") self.ipv6_prefix = f"{local_prefix}::" self.nk_host_ipv6 = f"{local_prefix}::2:1" @@ -319,7 +323,11 @@ class NetDrvContEnv(NetDrvEpEnv): self._tc_attached = False self._bpf_prog_pref = None self._bpf_prog_id = None + self._leased = False + nk_rxqueues = 1 + if lease: + nk_rxqueues = 2 ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") all_links = ip("-d link show", json=True) @@ -336,6 +344,9 @@ class NetDrvContEnv(NetDrvEpEnv): self.nk_host_ifindex = netkit_links[1]['ifindex'] self.nk_guest_ifindex = netkit_links[0]['ifindex'] + if lease: + self._lease_queues() + self._setup_ns() self._attach_bpf() @@ -353,8 +364,42 @@ class NetDrvContEnv(NetDrvEpEnv): del self.netns self.netns = None + if self._leased: + self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': self._hds_thresh, + 'rx': self._rx_rings}) + self._leased = False + super().__del__() + def _lease_queues(self): + channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') + + rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}}) + self._rx_rings = rings['rx'] + self._hds_thresh = rings.get('hds-thresh', 0) + self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + self.src_queue = channels - 1 + bind_result = self.netdevnl.queue_create( + { + "ifindex": self.nk_guest_ifindex, + "type": "rx", + "lease": { + "ifindex": self.ifindex, + "queue": {"id": self.src_queue, "type": "rx"}, + }, + } + ) + self.nk_queue = bind_result['id'] + self._leased = True + def _setup_ns(self): self.netns = NetNS() ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") -- cgit v1.2.3 From 931420a2fc363817c92990fa14eb1bdec024ce04 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:03 +0100 Subject: selftests/net: Add netkit container tests Add two tests using NetDrvContEnv. One basic test that sets up a netkit pair, with one end in a netns. Use LOCAL_PREFIX_V6 and nk_forward BPF program to ping from a remote host to the netkit in netns. Second is a selftest for netkit queue leasing, using io_uring zero copy test binary inside of a netns with netkit. This checks that memory providers can be bound against virtual queues in a netkit within a netns that are leasing from a physical netdev in the default netns. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-17-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/hw/Makefile | 2 + tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 +++++++++ .../testing/selftests/drivers/net/hw/nk_qlease.py | 55 ++++++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py create mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 9c163ba6feee..39ad86d693b3 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -32,6 +32,8 @@ TEST_PROGS = \ irq.py \ loopback.sh \ nic_timestamp.py \ + nk_netns.py \ + nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py new file mode 100755 index 000000000000..afa8638195d8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_netns.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvContEnv +from lib.py import cmd + + +def test_ping(cfg) -> None: + cfg.require_ipver("6") + + cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) + cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) + + +def main() -> None: + with NetDrvContEnv(__file__) as cfg: + ksft_run([test_ping], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py new file mode 100755 index 000000000000..738a46d2d20c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import re +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvContEnv +from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen + + +def create_rss_ctx(cfg): + output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout + values = re.search(r'New RSS context is (\d+)', output).group(1) + return int(values) + + +def set_flow_rule(cfg): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def set_flow_rule_rss(cfg, rss_ctx_id): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def test_iou_zcrx(cfg) -> None: + cfg.require_ipver('6') + + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = set_flow_rule(cfg) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + cmd(tx_cmd, host=cfg.remote) + + +def main() -> None: + with NetDrvContEnv(__file__, lease=True) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + cfg.port = rand_port() + ksft_run([test_iou_zcrx], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 5d54aa40c7b7e9dee5746cca99e9ddbcca13e895 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 16 Jan 2026 09:52:36 +0100 Subject: vsock/test: Do not filter kallsyms by symbol type Blamed commit implemented logic to discover available vsock transports by grepping /proc/kallsyms for known symbols. It incorrectly filtered entries by type 'd'. For some kernel configs having CONFIG_VIRTIO_VSOCKETS=m CONFIG_VSOCKETS_LOOPBACK=y kallsyms reports 0000000000000000 d virtio_transport [vmw_vsock_virtio_transport] 0000000000000000 t loopback_transport Overzealous filtering might have affected vsock test suit, resulting in insufficient/misleading testing. Do not filter symbols by type. It never helped much. Fixes: 3070c05b7afd ("vsock/test: Introduce get_transports()") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260116-vsock_test-kallsyms-grep-v1-1-3320bc3346f2@rbox.co Signed-off-by: Paolo Abeni --- tools/testing/vsock/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index 142c02a6834a..bf633cde82b0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -25,7 +25,7 @@ enum transport { }; static const char * const transport_ksyms[] = { - #define x(name, symbol) "d " symbol "_transport", + #define x(name, symbol) " " symbol "_transport", KNOWN_TRANSPORTS(x) #undef x }; -- cgit v1.2.3 From 92d65d9c31621befe0a5f7c0bd43bd217613c6b6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:27 -0800 Subject: perf symbol-elf: Fix leak of ELF files with GNU debugdata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The processing of DSO_BINARY_TYPE__GNU_DEBUGDATA in symsrc__init happens with an open ELF file but the error path only closes the associate fd. Fix the goto so that the ELF file is also ended and memory released. Fixes: b10f74308e130527 ("perf symbol: Support .gnu_debugdata for symbols") Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index b8fea12997a0..76912c62b6a0 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1173,7 +1173,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, Elf *embedded = read_gnu_debugdata(dso, elf, name, &new_fd); if (!embedded) - goto out_close; + goto out_elf_end; elf_end(elf); close(fd); -- cgit v1.2.3 From e99d544c7f3691eb321c88fdbadf04b777c114c4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:28 -0800 Subject: perf dso: Extra validity checks that e_machine is valid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Better ensure a read e_machine is valid by checking the file appears like an ELF file and the read e_machine value is less than EM_NUM. This better avoids spurious e_machine values when looking for an e_machine in say a thread. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 18e656712f5a..143720d1ecb1 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1236,17 +1236,28 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine) try_to_open_dso(dso, machine); fd = dso__data(dso)->fd; if (fd >= 0) { - _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); - _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); - if (dso__needs_swap(dso) == DSO_SWAP__UNSET) { - unsigned char eidata; - - if (pread(fd, &eidata, sizeof(eidata), EI_DATA) == sizeof(eidata)) - dso__swap_init(dso, eidata); + unsigned char e_ident[EI_NIDENT]; + + _Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset"); + _Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset"); + if (pread(fd, &e_ident, sizeof(e_ident), 0) == sizeof(e_ident) && + memcmp(e_ident, ELFMAG, SELFMAG) == 0 && + e_ident[EI_CLASS] > ELFCLASSNONE && e_ident[EI_CLASS] < ELFCLASSNUM && + e_ident[EI_DATA] > ELFDATANONE && e_ident[EI_DATA] < ELFDATANUM && + e_ident[EI_VERSION] == EV_CURRENT) { + _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); + _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); + + if (dso__needs_swap(dso) == DSO_SWAP__UNSET) + dso__swap_init(dso, e_ident[EI_DATA]); + + if (dso__needs_swap(dso) != DSO_SWAP__UNSET && + pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine) && + e_machine < EM_NUM) + e_machine = DSO__SWAP(dso, uint16_t, e_machine); + else + e_machine = EM_NONE; } - if (dso__needs_swap(dso) != DSO_SWAP__UNSET && - pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine)) - e_machine = DSO__SWAP(dso, uint16_t, e_machine); } mutex_unlock(dso__data_open_lock()); return e_machine; -- cgit v1.2.3 From 86f3801208ed1632ddd75a8e95ade5e433567be1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:29 -0800 Subject: perf record: Disable inline frames when marking build IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marking DSOs doesn't need inline frames traversing as the inline frames are all part of the same DSO. Disable to improve performance and also to avoid potential issues with dwarf information. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 003e47a4fc1d..663ca3a03396 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1509,6 +1509,8 @@ static int process_buildids(struct record *rec) if (perf_data__size(&rec->data) == 0) return 0; + /* A single DSO is needed and not all inline frames. */ + symbol_conf.inline_name = false; /* * During this process, it'll load kernel map and replace the * dso->long_name to a real pathname it found. In this case @@ -1519,7 +1521,6 @@ static int process_buildids(struct record *rec) * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551 */ symbol_conf.ignore_vmlinux_buildid = true; - /* * If --buildid-all is given, it marks all DSO regardless of hits, * so no need to process samples. But if timestamp_boundary is enabled, -- cgit v1.2.3 From e62fae9d9e85d38cdda1ee08a424e1b5b8246620 Mon Sep 17 00:00:00 2001 From: Shimin Guo Date: Fri, 16 Jan 2026 21:28:30 -0800 Subject: perf unwind-libdw: Fix a cross-arch unwinding bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The set_initial_registers field of Dwfl_Thread_Callbacks needs to be set according to the arch of the stack samples being analyzed, not the arch that perf itself is built for. Currently perf fails to unwind stack samples collected from archs different from that of the host perf is running on. This patch moves the arch-specific implementations of set_initial_registers from tools/perf/arch to tools/perf/utli/unwind-libdw-arch, similar to the way the perf-regs-arch folder contains arch-specific functions related to registers, and chooses the implementation based on the arch of the data being processed. Reviewed-by: Ian Rogers Signed-off-by: Shimin Guo Acked-by: Namhyung Kim Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/Build | 1 - tools/perf/arch/arm/util/unwind-libdw.c | 39 ----------- tools/perf/arch/arm64/util/Build | 1 - tools/perf/arch/arm64/util/unwind-libdw.c | 61 ----------------- tools/perf/arch/csky/util/Build | 2 - tools/perf/arch/csky/util/unwind-libdw.c | 78 ---------------------- tools/perf/arch/loongarch/util/unwind-libdw.c | 57 ---------------- tools/perf/arch/powerpc/util/Build | 1 - tools/perf/arch/powerpc/util/unwind-libdw.c | 76 --------------------- tools/perf/arch/riscv/util/Build | 1 - tools/perf/arch/riscv/util/unwind-libdw.c | 58 ---------------- tools/perf/arch/s390/util/Build | 2 - tools/perf/arch/s390/util/unwind-libdw.c | 65 ------------------ tools/perf/arch/x86/util/Build | 1 - tools/perf/arch/x86/util/unwind-libdw.c | 54 --------------- tools/perf/util/Build | 1 + tools/perf/util/unwind-libdw-arch/Build | 8 +++ .../perf/util/unwind-libdw-arch/unwind-libdw-arm.c | 39 +++++++++++ .../util/unwind-libdw-arch/unwind-libdw-arm64.c | 61 +++++++++++++++++ .../util/unwind-libdw-arch/unwind-libdw-csky.c | 78 ++++++++++++++++++++++ .../unwind-libdw-arch/unwind-libdw-loongarch.c | 57 ++++++++++++++++ .../util/unwind-libdw-arch/unwind-libdw-powerpc.c | 76 +++++++++++++++++++++ .../util/unwind-libdw-arch/unwind-libdw-riscv.c | 58 ++++++++++++++++ .../util/unwind-libdw-arch/unwind-libdw-s390.c | 65 ++++++++++++++++++ .../perf/util/unwind-libdw-arch/unwind-libdw-x86.c | 54 +++++++++++++++ tools/perf/util/unwind-libdw.c | 51 ++++++++++++-- tools/perf/util/unwind-libdw.h | 10 ++- 27 files changed, 551 insertions(+), 504 deletions(-) delete mode 100644 tools/perf/arch/arm/util/unwind-libdw.c delete mode 100644 tools/perf/arch/arm64/util/unwind-libdw.c delete mode 100644 tools/perf/arch/csky/util/unwind-libdw.c delete mode 100644 tools/perf/arch/loongarch/util/unwind-libdw.c delete mode 100644 tools/perf/arch/powerpc/util/unwind-libdw.c delete mode 100644 tools/perf/arch/riscv/util/unwind-libdw.c delete mode 100644 tools/perf/arch/s390/util/unwind-libdw.c delete mode 100644 tools/perf/arch/x86/util/unwind-libdw.c create mode 100644 tools/perf/util/unwind-libdw-arch/Build create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c (limited to 'tools') diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build index fd695e1fdaee..3291f893b943 100644 --- a/tools/perf/arch/arm/util/Build +++ b/tools/perf/arch/arm/util/Build @@ -1,6 +1,5 @@ perf-util-y += perf_regs.o perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-util-y += pmu.o auxtrace.o cs-etm.o diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c deleted file mode 100644 index fbb643f224ec..000000000000 --- a/tools/perf/arch/arm/util/unwind-libdw.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "perf_regs.h" -#include "../../../util/unwind-libdw.h" -#include "../../../util/perf_regs.h" -#include "../../../util/sample.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_ARM_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(FP); - dwarf_regs[12] = REG(IP); - dwarf_regs[13] = REG(SP); - dwarf_regs[14] = REG(LR); - dwarf_regs[15] = REG(PC); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX, - dwarf_regs); -} diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index d63881081d2e..0177af19cc00 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -1,4 +1,3 @@ -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o perf-util-y += ../../arm/util/auxtrace.o diff --git a/tools/perf/arch/arm64/util/unwind-libdw.c b/tools/perf/arch/arm64/util/unwind-libdw.c deleted file mode 100644 index b89b0a7e5ad9..000000000000 --- a/tools/perf/arch/arm64/util/unwind-libdw.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "perf_regs.h" -#include "../../../util/unwind-libdw.h" -#include "../../../util/perf_regs.h" -#include "../../../util/sample.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(X0); - dwarf_regs[1] = REG(X1); - dwarf_regs[2] = REG(X2); - dwarf_regs[3] = REG(X3); - dwarf_regs[4] = REG(X4); - dwarf_regs[5] = REG(X5); - dwarf_regs[6] = REG(X6); - dwarf_regs[7] = REG(X7); - dwarf_regs[8] = REG(X8); - dwarf_regs[9] = REG(X9); - dwarf_regs[10] = REG(X10); - dwarf_regs[11] = REG(X11); - dwarf_regs[12] = REG(X12); - dwarf_regs[13] = REG(X13); - dwarf_regs[14] = REG(X14); - dwarf_regs[15] = REG(X15); - dwarf_regs[16] = REG(X16); - dwarf_regs[17] = REG(X17); - dwarf_regs[18] = REG(X18); - dwarf_regs[19] = REG(X19); - dwarf_regs[20] = REG(X20); - dwarf_regs[21] = REG(X21); - dwarf_regs[22] = REG(X22); - dwarf_regs[23] = REG(X23); - dwarf_regs[24] = REG(X24); - dwarf_regs[25] = REG(X25); - dwarf_regs[26] = REG(X26); - dwarf_regs[27] = REG(X27); - dwarf_regs[28] = REG(X28); - dwarf_regs[29] = REG(X29); - dwarf_regs[30] = REG(LR); - dwarf_regs[31] = REG(SP); - - if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX, - dwarf_regs)) - return false; - - dwarf_pc = REG(PC); - dwfl_thread_state_register_pc(thread, dwarf_pc); - - return true; -} diff --git a/tools/perf/arch/csky/util/Build b/tools/perf/arch/csky/util/Build index 5e6ea82c4202..6b2d0e021b11 100644 --- a/tools/perf/arch/csky/util/Build +++ b/tools/perf/arch/csky/util/Build @@ -1,3 +1 @@ perf-util-y += perf_regs.o - -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/csky/util/unwind-libdw.c b/tools/perf/arch/csky/util/unwind-libdw.c deleted file mode 100644 index b20b1569783d..000000000000 --- a/tools/perf/arch/csky/util/unwind-libdw.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. - -#include -#include "perf_regs.h" -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/event.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r); \ - val; \ -}) - -#if defined(__CSKYABIV2__) - dwarf_regs[0] = REG(A0); - dwarf_regs[1] = REG(A1); - dwarf_regs[2] = REG(A2); - dwarf_regs[3] = REG(A3); - dwarf_regs[4] = REG(REGS0); - dwarf_regs[5] = REG(REGS1); - dwarf_regs[6] = REG(REGS2); - dwarf_regs[7] = REG(REGS3); - dwarf_regs[8] = REG(REGS4); - dwarf_regs[9] = REG(REGS5); - dwarf_regs[10] = REG(REGS6); - dwarf_regs[11] = REG(REGS7); - dwarf_regs[12] = REG(REGS8); - dwarf_regs[13] = REG(REGS9); - dwarf_regs[14] = REG(SP); - dwarf_regs[15] = REG(LR); - dwarf_regs[16] = REG(EXREGS0); - dwarf_regs[17] = REG(EXREGS1); - dwarf_regs[18] = REG(EXREGS2); - dwarf_regs[19] = REG(EXREGS3); - dwarf_regs[20] = REG(EXREGS4); - dwarf_regs[21] = REG(EXREGS5); - dwarf_regs[22] = REG(EXREGS6); - dwarf_regs[23] = REG(EXREGS7); - dwarf_regs[24] = REG(EXREGS8); - dwarf_regs[25] = REG(EXREGS9); - dwarf_regs[26] = REG(EXREGS10); - dwarf_regs[27] = REG(EXREGS11); - dwarf_regs[28] = REG(EXREGS12); - dwarf_regs[29] = REG(EXREGS13); - dwarf_regs[30] = REG(EXREGS14); - dwarf_regs[31] = REG(TLS); - dwarf_regs[32] = REG(PC); -#else - dwarf_regs[0] = REG(SP); - dwarf_regs[1] = REG(REGS9); - dwarf_regs[2] = REG(A0); - dwarf_regs[3] = REG(A1); - dwarf_regs[4] = REG(A2); - dwarf_regs[5] = REG(A3); - dwarf_regs[6] = REG(REGS0); - dwarf_regs[7] = REG(REGS1); - dwarf_regs[8] = REG(REGS2); - dwarf_regs[9] = REG(REGS3); - dwarf_regs[10] = REG(REGS4); - dwarf_regs[11] = REG(REGS5); - dwarf_regs[12] = REG(REGS6); - dwarf_regs[13] = REG(REGS7); - dwarf_regs[14] = REG(REGS8); - dwarf_regs[15] = REG(LR); -#endif - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX, - dwarf_regs); -} diff --git a/tools/perf/arch/loongarch/util/unwind-libdw.c b/tools/perf/arch/loongarch/util/unwind-libdw.c deleted file mode 100644 index 60b1144bedd5..000000000000 --- a/tools/perf/arch/loongarch/util/unwind-libdw.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ - -#include -#include "perf_regs.h" -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/sample.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r); \ - val; \ -}) - - dwarf_regs[0] = 0; - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(R16); - dwarf_regs[17] = REG(R17); - dwarf_regs[18] = REG(R18); - dwarf_regs[19] = REG(R19); - dwarf_regs[20] = REG(R20); - dwarf_regs[21] = REG(R21); - dwarf_regs[22] = REG(R22); - dwarf_regs[23] = REG(R23); - dwarf_regs[24] = REG(R24); - dwarf_regs[25] = REG(R25); - dwarf_regs[26] = REG(R26); - dwarf_regs[27] = REG(R27); - dwarf_regs[28] = REG(R28); - dwarf_regs[29] = REG(R29); - dwarf_regs[30] = REG(R30); - dwarf_regs[31] = REG(R31); - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs); -} diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index 3d0d5427aef7..5fd28ec713a4 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -9,5 +9,4 @@ perf-util-y += evsel.o perf-util-$(CONFIG_LIBDW) += skip-callchain-idx.o perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-util-y += auxtrace.o diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c deleted file mode 100644 index 82d0c28ae345..000000000000 --- a/tools/perf/arch/powerpc/util/unwind-libdw.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "perf_regs.h" -#include "../../../util/unwind-libdw.h" -#include "../../../util/perf_regs.h" -#include "../../../util/sample.h" - -/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils. */ -static const int special_regs[3][2] = { - { 65, PERF_REG_POWERPC_LINK }, - { 101, PERF_REG_POWERPC_XER }, - { 109, PERF_REG_POWERPC_CTR }, -}; - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[32], dwarf_nip; - size_t i; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(R16); - dwarf_regs[17] = REG(R17); - dwarf_regs[18] = REG(R18); - dwarf_regs[19] = REG(R19); - dwarf_regs[20] = REG(R20); - dwarf_regs[21] = REG(R21); - dwarf_regs[22] = REG(R22); - dwarf_regs[23] = REG(R23); - dwarf_regs[24] = REG(R24); - dwarf_regs[25] = REG(R25); - dwarf_regs[26] = REG(R26); - dwarf_regs[27] = REG(R27); - dwarf_regs[28] = REG(R28); - dwarf_regs[29] = REG(R29); - dwarf_regs[30] = REG(R30); - dwarf_regs[31] = REG(R31); - if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs)) - return false; - - dwarf_nip = REG(NIP); - dwfl_thread_state_register_pc(thread, dwarf_nip); - for (i = 0; i < ARRAY_SIZE(special_regs); i++) { - Dwarf_Word val = 0; - perf_reg_value(&val, user_regs, special_regs[i][1]); - if (!dwfl_thread_state_registers(thread, - special_regs[i][0], 1, - &val)) - return false; - } - - return true; -} diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build index 58a672246024..628b9ebd418b 100644 --- a/tools/perf/arch/riscv/util/Build +++ b/tools/perf/arch/riscv/util/Build @@ -2,4 +2,3 @@ perf-util-y += perf_regs.o perf-util-y += header.o perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/riscv/util/unwind-libdw.c b/tools/perf/arch/riscv/util/unwind-libdw.c deleted file mode 100644 index dc1476e16321..000000000000 --- a/tools/perf/arch/riscv/util/unwind-libdw.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */ - -#include -#include "perf_regs.h" -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/sample.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[32]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r); \ - val; \ -}) - - dwarf_regs[0] = 0; - dwarf_regs[1] = REG(RA); - dwarf_regs[2] = REG(SP); - dwarf_regs[3] = REG(GP); - dwarf_regs[4] = REG(TP); - dwarf_regs[5] = REG(T0); - dwarf_regs[6] = REG(T1); - dwarf_regs[7] = REG(T2); - dwarf_regs[8] = REG(S0); - dwarf_regs[9] = REG(S1); - dwarf_regs[10] = REG(A0); - dwarf_regs[11] = REG(A1); - dwarf_regs[12] = REG(A2); - dwarf_regs[13] = REG(A3); - dwarf_regs[14] = REG(A4); - dwarf_regs[15] = REG(A5); - dwarf_regs[16] = REG(A6); - dwarf_regs[17] = REG(A7); - dwarf_regs[18] = REG(S2); - dwarf_regs[19] = REG(S3); - dwarf_regs[20] = REG(S4); - dwarf_regs[21] = REG(S5); - dwarf_regs[22] = REG(S6); - dwarf_regs[23] = REG(S7); - dwarf_regs[24] = REG(S8); - dwarf_regs[25] = REG(S9); - dwarf_regs[26] = REG(S10); - dwarf_regs[27] = REG(S11); - dwarf_regs[28] = REG(T3); - dwarf_regs[29] = REG(T4); - dwarf_regs[30] = REG(T5); - dwarf_regs[31] = REG(T6); - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX, - dwarf_regs); -} diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index c64eb18dbdae..5391d26fedd4 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -2,8 +2,6 @@ perf-util-y += header.o perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o perf-util-y += perf_regs.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o - perf-util-y += machine.o perf-util-y += pmu.o diff --git a/tools/perf/arch/s390/util/unwind-libdw.c b/tools/perf/arch/s390/util/unwind-libdw.c deleted file mode 100644 index c27c7a0d1076..000000000000 --- a/tools/perf/arch/s390/util/unwind-libdw.c +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/event.h" -#include "../../util/sample.h" -#include "dwarf-regs-table.h" -#include "perf_regs.h" - - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_S390_##r); \ - val; \ -}) - /* - * For DWARF register mapping details, - * see also perf/arch/s390/include/dwarf-regs-table.h - */ - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - - dwarf_regs[16] = REG(FP0); - dwarf_regs[17] = REG(FP2); - dwarf_regs[18] = REG(FP4); - dwarf_regs[19] = REG(FP6); - dwarf_regs[20] = REG(FP1); - dwarf_regs[21] = REG(FP3); - dwarf_regs[22] = REG(FP5); - dwarf_regs[23] = REG(FP7); - dwarf_regs[24] = REG(FP8); - dwarf_regs[25] = REG(FP10); - dwarf_regs[26] = REG(FP12); - dwarf_regs[27] = REG(FP14); - dwarf_regs[28] = REG(FP9); - dwarf_regs[29] = REG(FP11); - dwarf_regs[30] = REG(FP13); - dwarf_regs[31] = REG(FP15); - - dwarf_regs[64] = REG(MASK); - dwarf_regs[65] = REG(PC); - - dwfl_thread_state_register_pc(thread, dwarf_regs[65]); - return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs); -} diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index c0dc5965f362..fad256252bb9 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -12,7 +12,6 @@ perf-util-y += evsel.o perf-util-y += iostat.o perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-util-y += auxtrace.o perf-util-y += archinsn.o diff --git a/tools/perf/arch/x86/util/unwind-libdw.c b/tools/perf/arch/x86/util/unwind-libdw.c deleted file mode 100644 index 798493e887d7..000000000000 --- a/tools/perf/arch/x86/util/unwind-libdw.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "perf_regs.h" -#include "../../../util/unwind-libdw.h" -#include "../../../util/perf_regs.h" -#include "util/sample.h" - -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[17]; - unsigned nregs; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_X86_##r); \ - val; \ -}) - - if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) { - dwarf_regs[0] = REG(AX); - dwarf_regs[1] = REG(CX); - dwarf_regs[2] = REG(DX); - dwarf_regs[3] = REG(BX); - dwarf_regs[4] = REG(SP); - dwarf_regs[5] = REG(BP); - dwarf_regs[6] = REG(SI); - dwarf_regs[7] = REG(DI); - dwarf_regs[8] = REG(IP); - nregs = 9; - } else { - dwarf_regs[0] = REG(AX); - dwarf_regs[1] = REG(DX); - dwarf_regs[2] = REG(CX); - dwarf_regs[3] = REG(BX); - dwarf_regs[4] = REG(SI); - dwarf_regs[5] = REG(DI); - dwarf_regs[6] = REG(BP); - dwarf_regs[7] = REG(SP); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(IP); - nregs = 17; - } - - return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs); -} diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 4915f237ba9e..5efec73be474 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -227,6 +227,7 @@ perf-util-$(CONFIG_LIBDW) += annotate-data.o perf-util-$(CONFIG_LIBDW) += libdw.o perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw-arch/ perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind-local.o perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o perf-util-$(CONFIG_LIBUNWIND_X86) += libunwind/x86_32.o diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build new file mode 100644 index 000000000000..ef17a83a7813 --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -0,0 +1,8 @@ +perf-util-y += unwind-libdw-x86.o +perf-util-y += unwind-libdw-arm.o +perf-util-y += unwind-libdw-arm64.o +perf-util-y += unwind-libdw-csky.o +perf-util-y += unwind-libdw-loongarch.o +perf-util-y += unwind-libdw-powerpc.o +perf-util-y += unwind-libdw-riscv.o +perf-util-y += unwind-libdw-s390.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c new file mode 100644 index 000000000000..56e9b5975bcc --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "../arch/arm/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_ARM_##r); \ + val; \ +}) + + dwarf_regs[0] = REG(R0); + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(FP); + dwarf_regs[12] = REG(IP); + dwarf_regs[13] = REG(SP); + dwarf_regs[14] = REG(LR); + dwarf_regs[15] = REG(PC); + + return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX, + dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c new file mode 100644 index 000000000000..29b6833e036c --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "../arch/arm64/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r); \ + val; \ +}) + + dwarf_regs[0] = REG(X0); + dwarf_regs[1] = REG(X1); + dwarf_regs[2] = REG(X2); + dwarf_regs[3] = REG(X3); + dwarf_regs[4] = REG(X4); + dwarf_regs[5] = REG(X5); + dwarf_regs[6] = REG(X6); + dwarf_regs[7] = REG(X7); + dwarf_regs[8] = REG(X8); + dwarf_regs[9] = REG(X9); + dwarf_regs[10] = REG(X10); + dwarf_regs[11] = REG(X11); + dwarf_regs[12] = REG(X12); + dwarf_regs[13] = REG(X13); + dwarf_regs[14] = REG(X14); + dwarf_regs[15] = REG(X15); + dwarf_regs[16] = REG(X16); + dwarf_regs[17] = REG(X17); + dwarf_regs[18] = REG(X18); + dwarf_regs[19] = REG(X19); + dwarf_regs[20] = REG(X20); + dwarf_regs[21] = REG(X21); + dwarf_regs[22] = REG(X22); + dwarf_regs[23] = REG(X23); + dwarf_regs[24] = REG(X24); + dwarf_regs[25] = REG(X25); + dwarf_regs[26] = REG(X26); + dwarf_regs[27] = REG(X27); + dwarf_regs[28] = REG(X28); + dwarf_regs[29] = REG(X29); + dwarf_regs[30] = REG(LR); + dwarf_regs[31] = REG(SP); + + if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX, + dwarf_regs)) + return false; + + dwarf_pc = REG(PC); + dwfl_thread_state_register_pc(thread, dwarf_pc); + + return true; +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c new file mode 100644 index 000000000000..2556d034c32a --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. + +#include +#include "../arch/csky/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r); \ + val; \ +}) + +#if defined(__CSKYABIV2__) + dwarf_regs[0] = REG(A0); + dwarf_regs[1] = REG(A1); + dwarf_regs[2] = REG(A2); + dwarf_regs[3] = REG(A3); + dwarf_regs[4] = REG(REGS0); + dwarf_regs[5] = REG(REGS1); + dwarf_regs[6] = REG(REGS2); + dwarf_regs[7] = REG(REGS3); + dwarf_regs[8] = REG(REGS4); + dwarf_regs[9] = REG(REGS5); + dwarf_regs[10] = REG(REGS6); + dwarf_regs[11] = REG(REGS7); + dwarf_regs[12] = REG(REGS8); + dwarf_regs[13] = REG(REGS9); + dwarf_regs[14] = REG(SP); + dwarf_regs[15] = REG(LR); + dwarf_regs[16] = REG(EXREGS0); + dwarf_regs[17] = REG(EXREGS1); + dwarf_regs[18] = REG(EXREGS2); + dwarf_regs[19] = REG(EXREGS3); + dwarf_regs[20] = REG(EXREGS4); + dwarf_regs[21] = REG(EXREGS5); + dwarf_regs[22] = REG(EXREGS6); + dwarf_regs[23] = REG(EXREGS7); + dwarf_regs[24] = REG(EXREGS8); + dwarf_regs[25] = REG(EXREGS9); + dwarf_regs[26] = REG(EXREGS10); + dwarf_regs[27] = REG(EXREGS11); + dwarf_regs[28] = REG(EXREGS12); + dwarf_regs[29] = REG(EXREGS13); + dwarf_regs[30] = REG(EXREGS14); + dwarf_regs[31] = REG(TLS); + dwarf_regs[32] = REG(PC); +#else + dwarf_regs[0] = REG(SP); + dwarf_regs[1] = REG(REGS9); + dwarf_regs[2] = REG(A0); + dwarf_regs[3] = REG(A1); + dwarf_regs[4] = REG(A2); + dwarf_regs[5] = REG(A3); + dwarf_regs[6] = REG(REGS0); + dwarf_regs[7] = REG(REGS1); + dwarf_regs[8] = REG(REGS2); + dwarf_regs[9] = REG(REGS3); + dwarf_regs[10] = REG(REGS4); + dwarf_regs[11] = REG(REGS5); + dwarf_regs[12] = REG(REGS6); + dwarf_regs[13] = REG(REGS7); + dwarf_regs[14] = REG(REGS8); + dwarf_regs[15] = REG(LR); +#endif + dwfl_thread_state_register_pc(thread, REG(PC)); + + return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX, + dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c new file mode 100644 index 000000000000..5fca673508be --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ + +#include +#include "../arch/loongarch/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r); \ + val; \ +}) + + dwarf_regs[0] = 0; + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + dwarf_regs[16] = REG(R16); + dwarf_regs[17] = REG(R17); + dwarf_regs[18] = REG(R18); + dwarf_regs[19] = REG(R19); + dwarf_regs[20] = REG(R20); + dwarf_regs[21] = REG(R21); + dwarf_regs[22] = REG(R22); + dwarf_regs[23] = REG(R23); + dwarf_regs[24] = REG(R24); + dwarf_regs[25] = REG(R25); + dwarf_regs[26] = REG(R26); + dwarf_regs[27] = REG(R27); + dwarf_regs[28] = REG(R28); + dwarf_regs[29] = REG(R29); + dwarf_regs[30] = REG(R30); + dwarf_regs[31] = REG(R31); + dwfl_thread_state_register_pc(thread, REG(PC)); + + return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c new file mode 100644 index 000000000000..1560db45e7b4 --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../arch/powerpc/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils. */ +static const int special_regs[3][2] = { + { 65, PERF_REG_POWERPC_LINK }, + { 101, PERF_REG_POWERPC_XER }, + { 109, PERF_REG_POWERPC_CTR }, +}; + +bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[32], dwarf_nip; + size_t i; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r); \ + val; \ +}) + + dwarf_regs[0] = REG(R0); + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + dwarf_regs[16] = REG(R16); + dwarf_regs[17] = REG(R17); + dwarf_regs[18] = REG(R18); + dwarf_regs[19] = REG(R19); + dwarf_regs[20] = REG(R20); + dwarf_regs[21] = REG(R21); + dwarf_regs[22] = REG(R22); + dwarf_regs[23] = REG(R23); + dwarf_regs[24] = REG(R24); + dwarf_regs[25] = REG(R25); + dwarf_regs[26] = REG(R26); + dwarf_regs[27] = REG(R27); + dwarf_regs[28] = REG(R28); + dwarf_regs[29] = REG(R29); + dwarf_regs[30] = REG(R30); + dwarf_regs[31] = REG(R31); + if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs)) + return false; + + dwarf_nip = REG(NIP); + dwfl_thread_state_register_pc(thread, dwarf_nip); + for (i = 0; i < ARRAY_SIZE(special_regs); i++) { + Dwarf_Word val = 0; + perf_reg_value(&val, user_regs, special_regs[i][1]); + if (!dwfl_thread_state_registers(thread, + special_regs[i][0], 1, + &val)) + return false; + } + + return true; +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c new file mode 100644 index 000000000000..c2e2c4b6b2e0 --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */ + +#include +#include "../arch/riscv/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[32]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r); \ + val; \ +}) + + dwarf_regs[0] = 0; + dwarf_regs[1] = REG(RA); + dwarf_regs[2] = REG(SP); + dwarf_regs[3] = REG(GP); + dwarf_regs[4] = REG(TP); + dwarf_regs[5] = REG(T0); + dwarf_regs[6] = REG(T1); + dwarf_regs[7] = REG(T2); + dwarf_regs[8] = REG(S0); + dwarf_regs[9] = REG(S1); + dwarf_regs[10] = REG(A0); + dwarf_regs[11] = REG(A1); + dwarf_regs[12] = REG(A2); + dwarf_regs[13] = REG(A3); + dwarf_regs[14] = REG(A4); + dwarf_regs[15] = REG(A5); + dwarf_regs[16] = REG(A6); + dwarf_regs[17] = REG(A7); + dwarf_regs[18] = REG(S2); + dwarf_regs[19] = REG(S3); + dwarf_regs[20] = REG(S4); + dwarf_regs[21] = REG(S5); + dwarf_regs[22] = REG(S6); + dwarf_regs[23] = REG(S7); + dwarf_regs[24] = REG(S8); + dwarf_regs[25] = REG(S9); + dwarf_regs[26] = REG(S10); + dwarf_regs[27] = REG(S11); + dwarf_regs[28] = REG(T3); + dwarf_regs[29] = REG(T4); + dwarf_regs[30] = REG(T5); + dwarf_regs[31] = REG(T6); + dwfl_thread_state_register_pc(thread, REG(PC)); + + return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX, + dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c new file mode 100644 index 000000000000..1e05e9d9d95f --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c @@ -0,0 +1,65 @@ +#include +#include +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/event.h" +#include "util/sample.h" +#include "../arch/s390/include/dwarf-regs-table.h" +#include "../arch/s390/include/uapi/asm/perf_regs.h" + + +bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_S390_##r); \ + val; \ +}) + /* + * For DWARF register mapping details, + * see also perf/arch/s390/include/dwarf-regs-table.h + */ + dwarf_regs[0] = REG(R0); + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + + dwarf_regs[16] = REG(FP0); + dwarf_regs[17] = REG(FP2); + dwarf_regs[18] = REG(FP4); + dwarf_regs[19] = REG(FP6); + dwarf_regs[20] = REG(FP1); + dwarf_regs[21] = REG(FP3); + dwarf_regs[22] = REG(FP5); + dwarf_regs[23] = REG(FP7); + dwarf_regs[24] = REG(FP8); + dwarf_regs[25] = REG(FP10); + dwarf_regs[26] = REG(FP12); + dwarf_regs[27] = REG(FP14); + dwarf_regs[28] = REG(FP9); + dwarf_regs[29] = REG(FP11); + dwarf_regs[30] = REG(FP13); + dwarf_regs[31] = REG(FP15); + + dwarf_regs[64] = REG(MASK); + dwarf_regs[65] = REG(PC); + + dwfl_thread_state_register_pc(thread, dwarf_regs[65]); + return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c new file mode 100644 index 000000000000..dd27545a4a68 --- /dev/null +++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "../arch/x86/include/uapi/asm/perf_regs.h" +#include "util/unwind-libdw.h" +#include "util/perf_regs.h" +#include "util/sample.h" + +bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word dwarf_regs[17]; + unsigned nregs; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_X86_##r); \ + val; \ +}) + + if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) { + dwarf_regs[0] = REG(AX); + dwarf_regs[1] = REG(CX); + dwarf_regs[2] = REG(DX); + dwarf_regs[3] = REG(BX); + dwarf_regs[4] = REG(SP); + dwarf_regs[5] = REG(BP); + dwarf_regs[6] = REG(SI); + dwarf_regs[7] = REG(DI); + dwarf_regs[8] = REG(IP); + nregs = 9; + } else { + dwarf_regs[0] = REG(AX); + dwarf_regs[1] = REG(DX); + dwarf_regs[2] = REG(CX); + dwarf_regs[3] = REG(BX); + dwarf_regs[4] = REG(SI); + dwarf_regs[5] = REG(DI); + dwarf_regs[6] = REG(BP); + dwarf_regs[7] = REG(SP); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + dwarf_regs[16] = REG(IP); + nregs = 17; + } + + return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs); +} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 3ff427a49e4c..b2e194a8be39 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -225,11 +225,45 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * return true; } -static const Dwfl_Thread_Callbacks callbacks = { - .next_thread = next_thread, - .memory_read = memory_read, - .set_initial_registers = libdw__arch_set_initial_registers, -}; +#define DEFINE_DWFL_THREAD_CALLBACKS(arch) \ +static const Dwfl_Thread_Callbacks callbacks_##arch = { \ + .next_thread = next_thread, \ + .memory_read = memory_read, \ + .set_initial_registers = libdw_set_initial_registers_##arch, \ +} + +DEFINE_DWFL_THREAD_CALLBACKS(x86); +DEFINE_DWFL_THREAD_CALLBACKS(arm); +DEFINE_DWFL_THREAD_CALLBACKS(arm64); +DEFINE_DWFL_THREAD_CALLBACKS(csky); +DEFINE_DWFL_THREAD_CALLBACKS(loongarch); +DEFINE_DWFL_THREAD_CALLBACKS(powerpc); +DEFINE_DWFL_THREAD_CALLBACKS(riscv); +DEFINE_DWFL_THREAD_CALLBACKS(s390); + +static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) +{ + if (!strcmp(arch, "arm")) + return &callbacks_arm; + else if (!strcmp(arch, "arm64")) + return &callbacks_arm64; + else if (!strcmp(arch, "csky")) + return &callbacks_csky; + else if (!strcmp(arch, "loongarch")) + return &callbacks_loongarch; + else if (!strcmp(arch, "powerpc")) + return &callbacks_powerpc; + else if (!strcmp(arch, "riscv")) + return &callbacks_riscv; + else if (!strcmp(arch, "s390")) + return &callbacks_s390; + else if (!strcmp(arch, "x86")) + return &callbacks_x86; + + pr_err("Fail to get thread callbacks for arch %s, returns NULL\n", + arch); + return NULL; +} static int frame_callback(Dwfl_Frame *state, void *arg) @@ -278,6 +312,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, const char *arch = perf_env__arch(ui_buf.machine->env); Dwarf_Word ip; int err = -EINVAL, i; + const Dwfl_Thread_Callbacks *callbacks; if (!data->user_regs || !data->user_regs->regs) return -EINVAL; @@ -300,7 +335,11 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (err) goto out; - err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), &callbacks, ui); + callbacks = get_thread_callbacks(arch); + if (!callbacks) + goto out; + + err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), callbacks, ui); if (err) goto out; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 8c88bc4f2304..574b29848cce 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,7 +9,15 @@ struct machine; struct perf_sample; struct thread; -bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg); +bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg); struct unwind_info { Dwfl *dwfl; -- cgit v1.2.3 From 6cc3e0f659b890cfb4a8753eb0e31c871cc7555b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:31 -0800 Subject: perf libdw_addr2line: Fixes to srcline memory allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some irregular stack traces are causing double frees and memory leaks. Make the code robust by proactively freeing and being more careful with the memory management of the leaf_srcline. Fixes: 88c51002d06f9a68 ("perf addr2line: Add a libdw implementation") Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/libdw.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c index e4bfd52bd172..b96c4e0d728f 100644 --- a/tools/perf/util/libdw.c +++ b/tools/perf/util/libdw.c @@ -42,16 +42,24 @@ static int libdw_a2l_cb(Dwarf_Die *die, void *_args) call_srcline = srcline_from_fileline(call_fname, die_get_call_lineno(die)); list_for_each_entry(ilist, &args->node->val, list) { + if (args->leaf_srcline == ilist->srcline) + args->leaf_srcline_used = false; + else if (ilist->srcline != srcline__unknown) + free(ilist->srcline); ilist->srcline = call_srcline; call_srcline = NULL; break; } - if (call_srcline && call_fname) + if (call_srcline && call_srcline != srcline__unknown) free(call_srcline); /* Add this symbol to the chain as the leaf. */ - inline_list__append_tail(inline_sym, args->leaf_srcline, args->node); - args->leaf_srcline_used = true; + if (!args->leaf_srcline_used) { + inline_list__append_tail(inline_sym, args->leaf_srcline, args->node); + args->leaf_srcline_used = true; + } else { + inline_list__append_tail(inline_sym, strdup(args->leaf_srcline), args->node); + } return 0; } -- cgit v1.2.3 From 8c59835851de28595e5899290f4b7aec656c7f24 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:32 -0800 Subject: perf unwind-libdw: Correct argument to dwfl_attach_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Argument is a pointer but EM_NONE (0) was being passed. Correct by passing NULL. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index b2e194a8be39..dc882f17f52d 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -339,7 +339,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (!callbacks) goto out; - err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), callbacks, ui); + err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), callbacks, ui); if (err) goto out; -- cgit v1.2.3 From b7a2b011e9627ff3359306f1eaac718baeadbd83 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:33 -0800 Subject: perf powerpc: Unify the skip-callchain-idx libdw with that for addr2line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than have 2 Dwfl unify the Dwfl in skip-callchain-idx with that is used by libdw__addr2line(). Rename that variable in 'struct dso' from 'a2l_libdw' to just 'libdw' as it is now used in more than addr2line. The Dwfl in skip-callchain-idx uses a map address when being read with dwfl_report_elf (rather than dwfl_report_offline that addr2line uses). skip-callchain-idx is wrong as the map address can vary between processes because of ASLR, ie it should need a different Dwfl per process. In the code after this patch the base address becomes 0 and the mapped PC is used with the dwfl functions. This should increase the accuracy of skip-callchain-idx, but the impact has only been build tested. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/skip-callchain-idx.c | 52 +++-------- tools/perf/util/dso.c | 2 +- tools/perf/util/dso.h | 23 +++-- tools/perf/util/libdw.c | 101 ++++++++++++---------- tools/perf/util/libdw.h | 12 ++- tools/perf/util/srcline.c | 2 +- 6 files changed, 89 insertions(+), 103 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 356786432fd3..e57f10798fa6 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -30,14 +30,6 @@ * The libdwfl code in this file is based on code from elfutils * (libdwfl/argp-std.c, libdwfl/tests/addrcfi.c, etc). */ -static char *debuginfo_path; - -static const Dwfl_Callbacks offline_callbacks = { - .debuginfo_path = &debuginfo_path, - .find_debuginfo = dwfl_standard_find_debuginfo, - .section_address = dwfl_offline_section_address, -}; - /* * Use the DWARF expression for the Call-frame-address and determine @@ -149,44 +141,22 @@ static Dwarf_Frame *get_dwarf_frame(Dwfl_Module *mod, Dwarf_Addr pc) * yet used) * -1 in case of errors */ -static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc) +static int check_return_addr(struct dso *dso, Dwarf_Addr mapped_pc) { int rc = -1; Dwfl *dwfl; Dwfl_Module *mod; Dwarf_Frame *frame; int ra_regno; - Dwarf_Addr start = pc; - Dwarf_Addr end = pc; + Dwarf_Addr start = mapped_pc; + Dwarf_Addr end = mapped_pc; bool signalp; - const char *exec_file = dso__long_name(dso); - - dwfl = RC_CHK_ACCESS(dso)->dwfl; - - if (!dwfl) { - dwfl = dwfl_begin(&offline_callbacks); - if (!dwfl) { - pr_debug("dwfl_begin() failed: %s\n", dwarf_errmsg(-1)); - return -1; - } - - mod = dwfl_report_elf(dwfl, exec_file, exec_file, -1, - map_start, false); - if (!mod) { - pr_debug("dwfl_report_elf() failed %s\n", - dwarf_errmsg(-1)); - /* - * We normally cache the DWARF debug info and never - * call dwfl_end(). But to prevent fd leak, free in - * case of error. - */ - dwfl_end(dwfl); - goto out; - } - RC_CHK_ACCESS(dso)->dwfl = dwfl; - } - mod = dwfl_addrmodule(dwfl, pc); + dwfl = dso__libdw_dwfl(dso); + if (!dwfl) + return -1; + + mod = dwfl_addrmodule(dwfl, mapped_pc); if (!mod) { pr_debug("dwfl_addrmodule() failed, %s\n", dwarf_errmsg(-1)); goto out; @@ -196,9 +166,9 @@ static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc) * To work with split debug info files (eg: glibc), check both * .eh_frame and .debug_frame sections of the ELF header. */ - frame = get_eh_frame(mod, pc); + frame = get_eh_frame(mod, mapped_pc); if (!frame) { - frame = get_dwarf_frame(mod, pc); + frame = get_dwarf_frame(mod, mapped_pc); if (!frame) goto out; } @@ -264,7 +234,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) return skip_slot; } - rc = check_return_addr(dso, map__start(al.map), ip); + rc = check_return_addr(dso, map__map_ip(al.map, ip)); pr_debug("[DSO %s, sym %s, ip 0x%" PRIx64 "] rc %d\n", dso__long_name(dso), al.sym->name, ip, rc); diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 143720d1ecb1..dce207c7f862 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1612,7 +1612,7 @@ void dso__delete(struct dso *dso) auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache); dso_cache__free(dso); dso__free_a2l(dso); - dso__free_a2l_libdw(dso); + dso__free_libdw(dso); dso__free_symsrc_filename(dso); nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo); mutex_destroy(dso__lock(dso)); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 4aee23775054..295388085031 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -268,11 +268,8 @@ DECLARE_RC_STRUCT(dso) { const char *short_name; const char *long_name; void *a2l; - void *a2l_libdw; + void *libdw; char *symsrc_filename; -#if defined(__powerpc__) - void *dwfl; /* DWARF debug info */ -#endif struct nsinfo *nsinfo; struct auxtrace_cache *auxtrace_cache; union { /* Tool specific area */ @@ -335,16 +332,26 @@ static inline void dso__set_a2l(struct dso *dso, void *val) RC_CHK_ACCESS(dso)->a2l = val; } -static inline void *dso__a2l_libdw(const struct dso *dso) +static inline void *dso__libdw(const struct dso *dso) { - return RC_CHK_ACCESS(dso)->a2l_libdw; + return RC_CHK_ACCESS(dso)->libdw; } -static inline void dso__set_a2l_libdw(struct dso *dso, void *val) +static inline void dso__set_libdw(struct dso *dso, void *val) { - RC_CHK_ACCESS(dso)->a2l_libdw = val; + RC_CHK_ACCESS(dso)->libdw = val; } +struct Dwfl; +#ifdef HAVE_LIBDW_SUPPORT +struct Dwfl *dso__libdw_dwfl(struct dso *dso); +#else +static inline struct Dwfl *dso__libdw_dwfl(struct dso *dso __maybe_unused) +{ + return NULL; +} +#endif + static inline unsigned int dso__a2l_fails(const struct dso *dso) { return RC_CHK_ACCESS(dso)->a2l_fails; diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c index b96c4e0d728f..216977884103 100644 --- a/tools/perf/util/libdw.c +++ b/tools/perf/util/libdw.c @@ -8,14 +8,62 @@ #include #include -void dso__free_a2l_libdw(struct dso *dso) +static const Dwfl_Callbacks offline_callbacks = { + .find_debuginfo = dwfl_standard_find_debuginfo, + .section_address = dwfl_offline_section_address, + .find_elf = dwfl_build_id_find_elf, +}; + +void dso__free_libdw(struct dso *dso) { - Dwfl *dwfl = dso__a2l_libdw(dso); + Dwfl *dwfl = dso__libdw(dso); if (dwfl) { dwfl_end(dwfl); - dso__set_a2l_libdw(dso, NULL); + dso__set_libdw(dso, NULL); + } +} + +struct Dwfl *dso__libdw_dwfl(struct dso *dso) +{ + Dwfl *dwfl = dso__libdw(dso); + const char *dso_name; + Dwfl_Module *mod; + int fd; + + if (dwfl) + return dwfl; + + dso_name = dso__long_name(dso); + /* + * Initialize Dwfl session. + * We need to open the DSO file to report it to libdw. + */ + fd = open(dso_name, O_RDONLY); + if (fd < 0) + return NULL; + + dwfl = dwfl_begin(&offline_callbacks); + if (!dwfl) { + close(fd); + return NULL; + } + + /* + * If the report is successful, the file descriptor fd is consumed + * and closed by the Dwfl. If not, it is not closed. + */ + mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd); + if (!mod) { + dwfl_end(dwfl); + close(fd); + return NULL; } + + dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL); + dso__set_libdw(dso, dwfl); + + return dwfl; } struct libdw_a2l_cb_args { @@ -63,58 +111,21 @@ static int libdw_a2l_cb(Dwarf_Die *die, void *_args) return 0; } -int libdw__addr2line(const char *dso_name, u64 addr, - char **file, unsigned int *line_nr, +int libdw__addr2line(u64 addr, char **file, unsigned int *line_nr, struct dso *dso, bool unwind_inlines, struct inline_node *node, struct symbol *sym) { - static const Dwfl_Callbacks offline_callbacks = { - .find_debuginfo = dwfl_standard_find_debuginfo, - .section_address = dwfl_offline_section_address, - .find_elf = dwfl_build_id_find_elf, - }; - Dwfl *dwfl = dso__a2l_libdw(dso); + Dwfl *dwfl = dso__libdw_dwfl(dso); Dwfl_Module *mod; Dwfl_Line *dwline; Dwarf_Addr bias; const char *src; int lineno = 0; - if (!dwfl) { - /* - * Initialize Dwfl session. - * We need to open the DSO file to report it to libdw. - */ - int fd; - - fd = open(dso_name, O_RDONLY); - if (fd < 0) - return 0; - - dwfl = dwfl_begin(&offline_callbacks); - if (!dwfl) { - close(fd); - return 0; - } - - /* - * If the report is successful, the file descriptor fd is consumed - * and closed by the Dwfl. If not, it is not closed. - */ - mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd); - if (!mod) { - dwfl_end(dwfl); - close(fd); - return 0; - } - - dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL); - dso__set_a2l_libdw(dso, dwfl); - } else { - /* Dwfl session already initialized, get module for address. */ - mod = dwfl_addrmodule(dwfl, addr); - } + if (!dwfl) + return 0; + mod = dwfl_addrmodule(dwfl, addr); if (!mod) return 0; diff --git a/tools/perf/util/libdw.h b/tools/perf/util/libdw.h index 0f8d7b4a11a5..b12094737415 100644 --- a/tools/perf/util/libdw.h +++ b/tools/perf/util/libdw.h @@ -11,7 +11,6 @@ struct symbol; #ifdef HAVE_LIBDW_SUPPORT /* * libdw__addr2line - Convert address to source location using libdw - * @dso_name: Name of the DSO * @addr: Address to resolve * @file: Pointer to return filename (caller must free) * @line_nr: Pointer to return line number @@ -26,23 +25,22 @@ struct symbol; * * Returns 1 on success (found), 0 on failure (not found). */ -int libdw__addr2line(const char *dso_name, u64 addr, char **file, +int libdw__addr2line(u64 addr, char **file, unsigned int *line_nr, struct dso *dso, bool unwind_inlines, struct inline_node *node, struct symbol *sym); /* - * dso__free_a2l_libdw - Free libdw resources associated with the DSO + * dso__free_libdw - Free libdw resources associated with the DSO * @dso: The dso to free resources for * * This function cleans up the Dwfl context used for addr2line lookups. */ -void dso__free_a2l_libdw(struct dso *dso); +void dso__free_libdw(struct dso *dso); #else /* HAVE_LIBDW_SUPPORT */ -static inline int libdw__addr2line(const char *dso_name __maybe_unused, - u64 addr __maybe_unused, char **file __maybe_unused, +static inline int libdw__addr2line(u64 addr __maybe_unused, char **file __maybe_unused, unsigned int *line_nr __maybe_unused, struct dso *dso __maybe_unused, bool unwind_inlines __maybe_unused, @@ -52,7 +50,7 @@ static inline int libdw__addr2line(const char *dso_name __maybe_unused, return 0; } -static inline void dso__free_a2l_libdw(struct dso *dso __maybe_unused) +static inline void dso__free_libdw(struct dso *dso __maybe_unused) { } #endif /* HAVE_LIBDW_SUPPORT */ diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 28fa1abd1fd3..9be42f398440 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -161,7 +161,7 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int * for (size_t i = 0; i < ARRAY_SIZE(symbol_conf.addr2line_style); i++) { switch (symbol_conf.addr2line_style[i]) { case A2L_STYLE_LIBDW: - ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, + ret = libdw__addr2line(addr, file, line_nr, dso, unwind_inlines, node, sym); break; case A2L_STYLE_LLVM: -- cgit v1.2.3 From db0c35ca36526f3072affcb573631ccf8c85f827 Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Sat, 17 Jan 2026 02:46:34 +0900 Subject: kunit: add bash completion Currently, kunit.py has many subcommands and options, making it difficult to remember them without checking the help message. Add --list-cmds and --list-opts to kunit.py to get available commands and options, use those outputs in kunit-completion.sh to show completion. This implementation is similar to perf and tools/perf/perf-completion.sh. Example output: $ source tools/testing/kunit/kunit-completion.sh $ ./tools/testing/kunit/kunit.py [TAB][TAB] build config exec parse run $ ./tools/testing/kunit/kunit.py run --k[TAB][TAB] --kconfig_add --kernel_args --kunitconfig Link: https://lore.kernel.org/r/20260117-kunit-completion-v2-1-cabd127d0801@gmail.com Reviewed-by: David Gow Signed-off-by: Ryota Sakamoto Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/run_wrapper.rst | 9 +++++++ tools/testing/kunit/kunit-completion.sh | 34 +++++++++++++++++++++++++++ tools/testing/kunit/kunit.py | 30 +++++++++++++++++++++++ tools/testing/kunit/kunit_tool_test.py | 21 +++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 tools/testing/kunit/kunit-completion.sh (limited to 'tools') diff --git a/Documentation/dev-tools/kunit/run_wrapper.rst b/Documentation/dev-tools/kunit/run_wrapper.rst index 6697c71ee8ca..3c0b585dcfff 100644 --- a/Documentation/dev-tools/kunit/run_wrapper.rst +++ b/Documentation/dev-tools/kunit/run_wrapper.rst @@ -335,3 +335,12 @@ command line arguments: - ``--list_tests_attr``: If set, lists all tests that will be run and all of their attributes. + +Command-line completion +============================== + +The kunit_tool comes with a bash completion script: + +.. code-block:: bash + + source tools/testing/kunit/kunit-completion.sh diff --git a/tools/testing/kunit/kunit-completion.sh b/tools/testing/kunit/kunit-completion.sh new file mode 100644 index 000000000000..f053e7b5d265 --- /dev/null +++ b/tools/testing/kunit/kunit-completion.sh @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +# bash completion support for KUnit + +_kunit_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +_kunit() +{ + local cur prev words cword + _init_completion || return + + local script="${_kunit_dir}/kunit.py" + + if [[ $cword -eq 1 && "$cur" != -* ]]; then + local cmds=$(${script} --list-cmds 2>/dev/null) + COMPREPLY=($(compgen -W "${cmds}" -- "$cur")) + return 0 + fi + + if [[ "$cur" == -* ]]; then + if [[ -n "${words[1]}" && "${words[1]}" != -* ]]; then + local opts=$(${script} ${words[1]} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + else + local opts=$(${script} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + fi + fi +} + +complete -o default -F _kunit kunit.py +complete -o default -F _kunit kunit +complete -o default -F _kunit ./tools/testing/kunit/kunit.py diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index e3d82a038f93..4ec5ecba6d49 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -328,6 +328,17 @@ def get_default_build_dir() -> str: return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit') return '.kunit' +def add_completion_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-opts', + help=argparse.SUPPRESS, + action='store_true') + +def add_root_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-cmds', + help=argparse.SUPPRESS, + action='store_true') + add_completion_opts(parser) + def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' @@ -379,6 +390,8 @@ def add_common_opts(parser: argparse.ArgumentParser) -> None: help='Additional QEMU arguments, e.g. "-smp 8"', action='append', metavar='') + add_completion_opts(parser) + def add_build_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' @@ -574,6 +587,7 @@ subcommand_handlers_map = { def main(argv: Sequence[str]) -> None: parser = argparse.ArgumentParser( description='Helps writing and running KUnit tests.') + add_root_opts(parser) subparser = parser.add_subparsers(dest='subcommand') # The 'run' command will config, build, exec, and parse in one go. @@ -608,12 +622,28 @@ def main(argv: Sequence[str]) -> None: parse_parser.add_argument('file', help='Specifies the file to read results from.', type=str, nargs='?', metavar='input_file') + add_completion_opts(parse_parser) cli_args = parser.parse_args(massage_argv(argv)) if get_kernel_root_path(): os.chdir(get_kernel_root_path()) + if cli_args.list_cmds: + print(" ".join(subparser.choices.keys())) + return + + if cli_args.list_opts: + target_parser = subparser.choices.get(cli_args.subcommand) + if not target_parser: + target_parser = parser + + # Accessing private attribute _option_string_actions to get + # the list of options. This is not a public API, but argparse + # does not provide a way to inspect options programmatically. + print(' '.join(target_parser._option_string_actions.keys())) + return + subcomand_handler = subcommand_handlers_map.get(cli_args.subcommand, None) if subcomand_handler is None: diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 238a31a5cc29..b67408147c1f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -11,11 +11,13 @@ from unittest import mock import tempfile, shutil # Handling test_tmpdir +import io import itertools import json import os import signal import subprocess +import sys from typing import Iterable import kunit_config @@ -886,5 +888,24 @@ class KUnitMainTest(unittest.TestCase): mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test1', filter='', filter_action=None, timeout=300), ]) + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_list_cmds(self, mock_stdout): + kunit.main(['--list-cmds']) + output = mock_stdout.getvalue() + output_cmds = sorted(output.split()) + expected_cmds = sorted(['build', 'config', 'exec', 'parse', 'run']) + self.assertEqual(output_cmds, expected_cmds) + + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_run_list_opts(self, mock_stdout): + kunit.main(['run', '--list-opts']) + output = mock_stdout.getvalue() + output_cmds = set(output.split()) + self.assertIn('--help', output_cmds) + self.assertIn('--kunitconfig', output_cmds) + self.assertIn('--jobs', output_cmds) + self.assertIn('--kernel_args', output_cmds) + self.assertIn('--raw_output', output_cmds) + if __name__ == '__main__': unittest.main() -- cgit v1.2.3 From a457ef08a72cb408318ddb851865c5981b842c63 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:34 -0800 Subject: perf perf_regs: Switch from arch string to int e_machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arch string requires multiple strcmp to identify things like the IP and SP. Switch to passing in an e_machine that in the bulk of cases is computed using a current thread load. The e_machine also allows identification of 32-bit vs 64-bit processes. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon [ Include dwarf-regs.h to get conditional defines for EM_CSKY and EM_LOONGARCH, not available in old distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 17 ++-- tools/perf/util/evsel.c | 14 ++- tools/perf/util/perf_regs.c | 106 +++++++++++++-------- tools/perf/util/perf_regs.h | 10 +- .../util/scripting-engines/trace-event-python.c | 21 ++-- tools/perf/util/session.c | 65 +++++++++---- tools/perf/util/session.h | 1 + tools/perf/util/unwind-libdw.c | 12 ++- tools/perf/util/unwind-libunwind-local.c | 7 +- 9 files changed, 165 insertions(+), 88 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 62e43d3c5ad7..372bede30230 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -717,7 +717,7 @@ out: return 0; } -static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, const char *arch, +static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, FILE *fp) { unsigned i = 0, r; @@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, cons for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { u64 val = regs->regs[i++]; - printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, arch), val); + printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, e_machine), val); } return printed; @@ -787,23 +787,23 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen, } static int perf_sample__fprintf_iregs(struct perf_sample *sample, - struct perf_event_attr *attr, const char *arch, FILE *fp) + struct perf_event_attr *attr, uint16_t e_machine, FILE *fp) { if (!sample->intr_regs) return 0; return perf_sample__fprintf_regs(perf_sample__intr_regs(sample), - attr->sample_regs_intr, arch, fp); + attr->sample_regs_intr, e_machine, fp); } static int perf_sample__fprintf_uregs(struct perf_sample *sample, - struct perf_event_attr *attr, const char *arch, FILE *fp) + struct perf_event_attr *attr, uint16_t e_machine, FILE *fp) { if (!sample->user_regs) return 0; return perf_sample__fprintf_regs(perf_sample__user_regs(sample), - attr->sample_regs_user, arch, fp); + attr->sample_regs_user, e_machine, fp); } static int perf_sample__fprintf_start(struct perf_script *script, @@ -2418,7 +2418,6 @@ static void process_event(struct perf_script *script, struct evsel_script *es = evsel->priv; FILE *fp = es->fp; char str[PAGE_SIZE_NAME_LEN]; - const char *arch = perf_env__arch(machine->env); if (output[type].fields == 0) return; @@ -2506,10 +2505,10 @@ static void process_event(struct perf_script *script, } if (PRINT_FIELD(IREGS)) - perf_sample__fprintf_iregs(sample, attr, arch, fp); + perf_sample__fprintf_iregs(sample, attr, thread__e_machine(thread, machine), fp); if (PRINT_FIELD(UREGS)) - perf_sample__fprintf_uregs(sample, attr, arch, fp); + perf_sample__fprintf_uregs(sample, attr, thread__e_machine(thread, machine), fp); if (PRINT_FIELD(BRSTACK)) perf_sample__fprintf_brstack(sample, thread, evsel, fp); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6d324141588c..5ac1a05601b1 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -34,6 +34,7 @@ #include "callchain.h" #include "cgroup.h" #include "counts.h" +#include "dwarf-regs.h" #include "event.h" #include "evsel.h" #include "time-utils.h" @@ -1007,6 +1008,13 @@ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size) return ret; } +static uint16_t evsel__e_machine(struct evsel *evsel) +{ + struct perf_session *session = evsel__session(evsel); + + return session ? perf_session__e_machine(session) : EM_HOST; +} + static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts, struct callchain_param *param) { @@ -1042,13 +1050,13 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o if (param->record_mode == CALLCHAIN_DWARF) { if (!function) { - const char *arch = perf_env__arch(evsel__env(evsel)); + uint16_t e_machine = evsel__e_machine(evsel); evsel__set_sample_bit(evsel, REGS_USER); evsel__set_sample_bit(evsel, STACK_USER); if (opts->sample_user_regs && - DWARF_MINIMAL_REGS(arch) != arch__user_reg_mask()) { - attr->sample_regs_user |= DWARF_MINIMAL_REGS(arch); + DWARF_MINIMAL_REGS(e_machine) != arch__user_reg_mask()) { + attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine); pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, " "specifying a subset with --user-regs may render DWARF unwinding unreliable, " "so the minimal registers set (IP, SP) is explicitly forced.\n"); diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index 44b90bbf2d07..f9723091e673 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include +#include "dwarf-regs.h" #include "perf_regs.h" #include "util/sample.h" #include "debug.h" @@ -30,30 +32,48 @@ const struct sample_reg * __weak arch__sample_reg_masks(void) return sample_reg_masks; } -const char *perf_reg_name(int id, const char *arch) +const char *perf_reg_name(int id, uint16_t e_machine) { const char *reg_name = NULL; - if (!strcmp(arch, "csky")) + switch (e_machine) { + case EM_ARM: + reg_name = __perf_reg_name_arm(id); + break; + case EM_AARCH64: + reg_name = __perf_reg_name_arm64(id); + break; + case EM_CSKY: reg_name = __perf_reg_name_csky(id); - else if (!strcmp(arch, "loongarch")) + break; + case EM_LOONGARCH: reg_name = __perf_reg_name_loongarch(id); - else if (!strcmp(arch, "mips")) + break; + case EM_MIPS: reg_name = __perf_reg_name_mips(id); - else if (!strcmp(arch, "powerpc")) + break; + case EM_PPC: + case EM_PPC64: reg_name = __perf_reg_name_powerpc(id); - else if (!strcmp(arch, "riscv")) + break; + case EM_RISCV: reg_name = __perf_reg_name_riscv(id); - else if (!strcmp(arch, "s390")) + break; + case EM_S390: reg_name = __perf_reg_name_s390(id); - else if (!strcmp(arch, "x86")) + break; + case EM_386: + case EM_X86_64: reg_name = __perf_reg_name_x86(id); - else if (!strcmp(arch, "arm")) - reg_name = __perf_reg_name_arm(id); - else if (!strcmp(arch, "arm64")) - reg_name = __perf_reg_name_arm64(id); + break; + default: + break; + } + if (reg_name) + return reg_name; - return reg_name ?: "unknown"; + pr_debug("Failed to find register %d for ELF machine type %u\n", id, e_machine); + return "unknown"; } int perf_reg_value(u64 *valp, struct regs_dump *regs, int id) @@ -83,52 +103,60 @@ out: return 0; } -uint64_t perf_arch_reg_ip(const char *arch) +uint64_t perf_arch_reg_ip(uint16_t e_machine) { - if (!strcmp(arch, "arm")) + switch (e_machine) { + case EM_ARM: return __perf_reg_ip_arm(); - else if (!strcmp(arch, "arm64")) + case EM_AARCH64: return __perf_reg_ip_arm64(); - else if (!strcmp(arch, "csky")) + case EM_CSKY: return __perf_reg_ip_csky(); - else if (!strcmp(arch, "loongarch")) + case EM_LOONGARCH: return __perf_reg_ip_loongarch(); - else if (!strcmp(arch, "mips")) + case EM_MIPS: return __perf_reg_ip_mips(); - else if (!strcmp(arch, "powerpc")) + case EM_PPC: + case EM_PPC64: return __perf_reg_ip_powerpc(); - else if (!strcmp(arch, "riscv")) + case EM_RISCV: return __perf_reg_ip_riscv(); - else if (!strcmp(arch, "s390")) + case EM_S390: return __perf_reg_ip_s390(); - else if (!strcmp(arch, "x86")) + case EM_386: + case EM_X86_64: return __perf_reg_ip_x86(); - - pr_err("Fail to find IP register for arch %s, returns 0\n", arch); - return 0; + default: + pr_err("Failed to find IP register for ELF machine type %u\n", e_machine); + return 0; + } } -uint64_t perf_arch_reg_sp(const char *arch) +uint64_t perf_arch_reg_sp(uint16_t e_machine) { - if (!strcmp(arch, "arm")) + switch (e_machine) { + case EM_ARM: return __perf_reg_sp_arm(); - else if (!strcmp(arch, "arm64")) + case EM_AARCH64: return __perf_reg_sp_arm64(); - else if (!strcmp(arch, "csky")) + case EM_CSKY: return __perf_reg_sp_csky(); - else if (!strcmp(arch, "loongarch")) + case EM_LOONGARCH: return __perf_reg_sp_loongarch(); - else if (!strcmp(arch, "mips")) + case EM_MIPS: return __perf_reg_sp_mips(); - else if (!strcmp(arch, "powerpc")) + case EM_PPC: + case EM_PPC64: return __perf_reg_sp_powerpc(); - else if (!strcmp(arch, "riscv")) + case EM_RISCV: return __perf_reg_sp_riscv(); - else if (!strcmp(arch, "s390")) + case EM_S390: return __perf_reg_sp_s390(); - else if (!strcmp(arch, "x86")) + case EM_386: + case EM_X86_64: return __perf_reg_sp_x86(); - - pr_err("Fail to find SP register for arch %s, returns 0\n", arch); - return 0; + default: + pr_err("Failed to find SP register for ELF machine type %u\n", e_machine); + return 0; + } } diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h index f2d0736d65cc..7bfc6a34c02b 100644 --- a/tools/perf/util/perf_regs.h +++ b/tools/perf/util/perf_regs.h @@ -28,10 +28,10 @@ uint64_t arch__intr_reg_mask(void); uint64_t arch__user_reg_mask(void); const struct sample_reg *arch__sample_reg_masks(void); -const char *perf_reg_name(int id, const char *arch); +const char *perf_reg_name(int id, uint16_t e_machine); int perf_reg_value(u64 *valp, struct regs_dump *regs, int id); -uint64_t perf_arch_reg_ip(const char *arch); -uint64_t perf_arch_reg_sp(const char *arch); +uint64_t perf_arch_reg_ip(uint16_t e_machine); +uint64_t perf_arch_reg_sp(uint16_t e_machine); const char *__perf_reg_name_arm64(int id); uint64_t __perf_reg_ip_arm64(void); uint64_t __perf_reg_sp_arm64(void); @@ -60,9 +60,9 @@ const char *__perf_reg_name_x86(int id); uint64_t __perf_reg_ip_x86(void); uint64_t __perf_reg_sp_x86(void); -static inline uint64_t DWARF_MINIMAL_REGS(const char *arch) +static inline uint64_t DWARF_MINIMAL_REGS(uint16_t e_machine) { - return (1ULL << perf_arch_reg_ip(arch)) | (1ULL << perf_arch_reg_sp(arch)); + return (1ULL << perf_arch_reg_ip(e_machine)) | (1ULL << perf_arch_reg_sp(e_machine)); } #endif /* __PERF_REGS_H */ diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 6655c0bbe0d8..b90edc147796 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -50,6 +50,7 @@ #include "../thread-stack.h" #include "../trace-event.h" #include "../call-path.h" +#include "dwarf-regs.h" #include "map.h" #include "symbol.h" #include "thread_map.h" @@ -713,7 +714,7 @@ static void set_sample_datasrc_in_dict(PyObject *dict, _PyUnicode_FromString(decode)); } -static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, char *bf, int size) +static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, char *bf, int size) { unsigned int i = 0, r; int printed = 0; @@ -731,7 +732,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch printed += scnprintf(bf + printed, size - printed, "%5s:0x%" PRIx64 " ", - perf_reg_name(r, arch), val); + perf_reg_name(r, e_machine), val); } } @@ -739,10 +740,10 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch static int set_regs_in_dict(PyObject *dict, struct perf_sample *sample, - struct evsel *evsel) + struct evsel *evsel, + uint16_t e_machine) { struct perf_event_attr *attr = &evsel->core.attr; - const char *arch = perf_env__arch(evsel__env(evsel)); int size = (__sw_hweight64(attr->sample_regs_intr) * MAX_REG_SIZE) + 1; char *bf = NULL; @@ -752,7 +753,7 @@ static int set_regs_in_dict(PyObject *dict, if (!bf) return -1; - regs_map(sample->intr_regs, attr->sample_regs_intr, arch, bf, size); + regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, bf, size); pydict_set_item_string_decref(dict, "iregs", _PyUnicode_FromString(bf)); @@ -764,7 +765,7 @@ static int set_regs_in_dict(PyObject *dict, if (!bf) return -1; } - regs_map(sample->user_regs, attr->sample_regs_user, arch, bf, size); + regs_map(sample->user_regs, attr->sample_regs_user, e_machine, bf, size); pydict_set_item_string_decref(dict, "uregs", _PyUnicode_FromString(bf)); @@ -834,6 +835,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, PyObject *callchain) { PyObject *dict, *dict_sample, *brstack, *brstacksym; + struct machine *machine; + uint16_t e_machine = EM_HOST; dict = PyDict_New(); if (!dict) @@ -920,7 +923,11 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, PyLong_FromUnsignedLongLong(sample->cyc_cnt)); } - if (set_regs_in_dict(dict, sample, evsel)) + if (al->thread) { + machine = maps__machine(thread__maps(al->thread)); + e_machine = thread__e_machine(al->thread, machine); + } + if (set_regs_in_dict(dict, sample, evsel, e_machine)) Py_FatalError("Failed to setting regs in dict"); return dict; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 922ef6577bbb..d7b28cb4e672 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -17,6 +17,7 @@ #include "map_symbol.h" #include "branch.h" #include "debug.h" +#include "dwarf-regs.h" #include "env.h" #include "evlist.h" #include "evsel.h" @@ -942,7 +943,7 @@ static void branch_stack__printf(struct perf_sample *sample, } } -static void regs_dump__printf(u64 mask, u64 *regs, const char *arch) +static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine) { unsigned rid, i = 0; @@ -950,7 +951,7 @@ static void regs_dump__printf(u64 mask, u64 *regs, const char *arch) u64 val = regs[i++]; printf(".... %-5s 0x%016" PRIx64 "\n", - perf_reg_name(rid, arch), val); + perf_reg_name(rid, e_machine), val); } } @@ -968,7 +969,7 @@ static inline const char *regs_dump_abi(struct regs_dump *d) return regs_abi[d->abi]; } -static void regs__printf(const char *type, struct regs_dump *regs, const char *arch) +static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_machine) { u64 mask = regs->mask; @@ -977,10 +978,10 @@ static void regs__printf(const char *type, struct regs_dump *regs, const char *a mask, regs_dump_abi(regs)); - regs_dump__printf(mask, regs->regs, arch); + regs_dump__printf(mask, regs->regs, e_machine); } -static void regs_user__printf(struct perf_sample *sample, const char *arch) +static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine) { struct regs_dump *user_regs; @@ -990,10 +991,10 @@ static void regs_user__printf(struct perf_sample *sample, const char *arch) user_regs = perf_sample__user_regs(sample); if (user_regs->regs) - regs__printf("user", user_regs, arch); + regs__printf("user", user_regs, e_machine); } -static void regs_intr__printf(struct perf_sample *sample, const char *arch) +static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine) { struct regs_dump *intr_regs; @@ -1003,7 +1004,7 @@ static void regs_intr__printf(struct perf_sample *sample, const char *arch) intr_regs = perf_sample__intr_regs(sample); if (intr_regs->regs) - regs__printf("intr", intr_regs, arch); + regs__printf("intr", intr_regs, e_machine); } static void stack_user__printf(struct stack_dump *dump) @@ -1092,21 +1093,28 @@ char *get_page_size_name(u64 size, char *str) return str; } -static void dump_sample(struct evsel *evsel, union perf_event *event, - struct perf_sample *sample, const char *arch) +static void dump_sample(struct machine *machine, struct evsel *evsel, union perf_event *event, + struct perf_sample *sample) { u64 sample_type; char str[PAGE_SIZE_NAME_LEN]; + uint16_t e_machine = EM_NONE; if (!dump_trace) return; + sample_type = evsel->core.attr.sample_type; + + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) { + struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid); + + e_machine = thread__e_machine(thread, machine); + } + printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", event->header.misc, sample->pid, sample->tid, sample->ip, sample->period, sample->addr); - sample_type = evsel->core.attr.sample_type; - if (evsel__has_callchain(evsel)) callchain__printf(evsel, sample); @@ -1114,10 +1122,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, branch_stack__printf(sample, evsel); if (sample_type & PERF_SAMPLE_REGS_USER) - regs_user__printf(sample, arch); + regs_user__printf(sample, e_machine); if (sample_type & PERF_SAMPLE_REGS_INTR) - regs_intr__printf(sample, arch); + regs_intr__printf(sample, e_machine); if (sample_type & PERF_SAMPLE_STACK_USER) stack_user__printf(&sample->user_stack); @@ -1432,10 +1440,10 @@ static int machines__deliver_event(struct machines *machines, } if (machine == NULL) { ++evlist->stats.nr_unprocessable_samples; - dump_sample(evsel, event, sample, perf_env__arch(NULL)); + dump_sample(machine, evsel, event, sample); return 0; } - dump_sample(evsel, event, sample, perf_env__arch(machine->env)); + dump_sample(machine, evsel, event, sample); if (sample->deferred_callchain && tool->merge_deferred_callchains) { struct deferred_event *de = malloc(sizeof(*de)); size_t sz = event->header.size; @@ -2928,3 +2936,28 @@ struct perf_env *perf_session__env(struct perf_session *session) { return &session->header.env; } + +static int perf_session__e_machine_cb(struct thread *thread, + void *arg __maybe_unused) +{ + uint16_t *result = arg; + struct machine *machine = maps__machine(thread__maps(thread)); + + *result = thread__e_machine(thread, machine); + return *result != EM_NONE ? 1 : 0; +} + +/* + * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed + * e_machines. Use thread__e_machine when this matters. + */ +uint16_t perf_session__e_machine(struct perf_session *session) +{ + uint16_t e_machine = EM_NONE; + + machines__for_each_thread(&session->machines, + perf_session__e_machine_cb, + &e_machine); + + return e_machine == EM_NONE ? EM_HOST : e_machine; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 22d3ff877e83..eddc4c630b33 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -211,5 +211,6 @@ int perf_event__process_finished_round(const struct perf_tool *tool, struct ordered_events *oe); struct perf_env *perf_session__env(struct perf_session *session); +uint16_t perf_session__e_machine(struct perf_session *session); #endif /* __PERF_SESSION_H */ diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index dc882f17f52d..c25190cdceb4 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -187,7 +187,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * void *arg) { struct unwind_info *ui = arg; - const char *arch = perf_env__arch(ui->machine->env); + uint16_t e_machine = thread__e_machine(ui->thread, ui->machine); struct stack_dump *stack = &ui->sample->user_stack; u64 start, end; int offset; @@ -197,7 +197,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * return false; ret = perf_reg_value(&start, ui->sample->user_regs, - perf_arch_reg_sp(arch)); + perf_arch_reg_sp(e_machine)); if (ret) return false; @@ -300,16 +300,18 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, int max_stack, bool best_effort) { + struct machine *machine = maps__machine(thread__maps(thread)); struct unwind_info *ui, ui_buf = { .sample = data, .thread = thread, - .machine = maps__machine((thread__maps(thread))), + .machine = machine, .cb = cb, .arg = arg, .max_stack = max_stack, .best_effort = best_effort }; - const char *arch = perf_env__arch(ui_buf.machine->env); + uint16_t e_machine = thread__e_machine(thread, machine); + const char *arch = perf_env__arch(machine->env); Dwarf_Word ip; int err = -EINVAL, i; const Dwfl_Thread_Callbacks *callbacks; @@ -327,7 +329,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (!ui->dwfl) goto out; - err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(arch)); + err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine)); if (err) goto out; diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 0b037e7389a0..a24b45106acd 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -572,7 +572,6 @@ static int access_mem(unw_addr_space_t __maybe_unused as, int __write, void *arg) { struct unwind_info *ui = arg; - const char *arch = perf_env__arch(ui->machine->env); struct stack_dump *stack = &ui->sample->user_stack; u64 start, end; int offset; @@ -585,7 +584,7 @@ static int access_mem(unw_addr_space_t __maybe_unused as, } ret = perf_reg_value(&start, perf_sample__user_regs(ui->sample), - perf_arch_reg_sp(arch)); + perf_arch_reg_sp(thread__e_machine(ui->thread, ui->machine))); if (ret) return ret; @@ -734,7 +733,7 @@ static void _unwind__finish_access(struct maps *maps) static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, void *arg, int max_stack) { - const char *arch = perf_env__arch(ui->machine->env); + uint16_t e_machine = thread__e_machine(ui->thread, ui->machine); u64 val; unw_word_t ips[max_stack]; unw_addr_space_t addr_space; @@ -742,7 +741,7 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, int ret, i = 0; ret = perf_reg_value(&val, perf_sample__user_regs(ui->sample), - perf_arch_reg_ip(arch)); + perf_arch_reg_ip(e_machine)); if (ret) return ret; -- cgit v1.2.3 From 1672f3707a6ef4b386c30bb76df2f62e58a39430 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:35 -0800 Subject: perf dwarf-regs: Add util/dwarf-regs-arch for consistency with perf-regs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit perf_regs.h has cross architecture functions for operating with the differing perf register constants. dwarf-regs.h is similar but for cross architecture dwarf notions of registers. For consistency move the arch parts of dwarf-regs out of util and into its own directory. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 4 +- tools/perf/util/dwarf-regs-arch/Build | 3 ++ tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c | 50 ++++++++++++++++++ .../perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c | 61 ++++++++++++++++++++++ tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c | 50 ++++++++++++++++++ tools/perf/util/dwarf-regs-csky.c | 50 ------------------ tools/perf/util/dwarf-regs-powerpc.c | 61 ---------------------- tools/perf/util/dwarf-regs-x86.c | 50 ------------------ 8 files changed, 165 insertions(+), 164 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/Build create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c delete mode 100644 tools/perf/util/dwarf-regs-csky.c delete mode 100644 tools/perf/util/dwarf-regs-powerpc.c delete mode 100644 tools/perf/util/dwarf-regs-x86.c (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 5efec73be474..3cb1edd263cf 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -219,9 +219,7 @@ endif perf-util-$(CONFIG_LIBDW) += probe-finder.o perf-util-$(CONFIG_LIBDW) += dwarf-aux.o perf-util-$(CONFIG_LIBDW) += dwarf-regs.o -perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o -perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o -perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-arch/ perf-util-$(CONFIG_LIBDW) += debuginfo.o perf-util-$(CONFIG_LIBDW) += annotate-data.o perf-util-$(CONFIG_LIBDW) += libdw.o diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build new file mode 100644 index 000000000000..98bec0032606 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -0,0 +1,3 @@ +perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c new file mode 100644 index 000000000000..d38ef1f07f3e --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. +// Mapping of DWARF debug register numbers into register names. + +#include +#include + +#define CSKY_ABIV2_MAX_REGS 73 +const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { + /* r0 ~ r8 */ + "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3", + /* r9 ~ r15 */ + "%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp", + "%lr", + /* r16 ~ r23 */ + "%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4", + "%exregs5", "%exregs6", "%exregs7", + /* r24 ~ r31 */ + "%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12", + "%exregs13", "%exregs14", "%tls", + "%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "%epc", +}; + +#define CSKY_ABIV1_MAX_REGS 57 +const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = { + /* r0 ~ r8 */ + "%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", + /* r9 ~ r15 */ + "%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8", + "%lr", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "%epc", +}; + +const char *get_csky_regstr(unsigned int n, unsigned int flags) +{ + if (flags & EF_CSKY_ABIV2) + return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL; + + return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL; +} diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c new file mode 100644 index 000000000000..caf77a234c78 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Mapping of DWARF debug register numbers into register names. + * + * Copyright (C) 2010 Ian Munsie, IBM Corporation. + */ + +#include + +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_RA(a) (((a) >> 16) & 0x1f) +#define PPC_RT(t) (((t) >> 21) & 0x1f) +#define PPC_RB(b) (((b) >> 11) & 0x1f) +#define PPC_D(D) ((D) & 0xfffe) +#define PPC_DS(DS) ((DS) & 0xfffc) +#define OP_LD 58 +#define OP_STD 62 + +static int get_source_reg(u32 raw_insn) +{ + return PPC_RA(raw_insn); +} + +static int get_target_reg(u32 raw_insn) +{ + return PPC_RT(raw_insn); +} + +static int get_offset_opcode(u32 raw_insn) +{ + int opcode = PPC_OP(raw_insn); + + /* DS- form */ + if ((opcode == OP_LD) || (opcode == OP_STD)) + return PPC_DS(raw_insn); + else + return PPC_D(raw_insn); +} + +/* + * Fills the required fields for op_loc depending on if it + * is a source or target. + * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT + * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT + * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT + */ +void get_powerpc_regs(u32 raw_insn, int is_source, + struct annotated_op_loc *op_loc) +{ + if (is_source) + op_loc->reg1 = get_source_reg(raw_insn); + else + op_loc->reg1 = get_target_reg(raw_insn); + + if (op_loc->multi_regs) + op_loc->reg2 = PPC_RB(raw_insn); + + /* TODO: Implement offset handling for X Form */ + if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31)) + op_loc->offset = get_offset_opcode(raw_insn); +} diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c new file mode 100644 index 000000000000..7a55c65e8da6 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * dwarf-regs.c : Mapping of DWARF debug register numbers into register names. + * Extracted from probe-finder.c + * + * Written by Masami Hiramatsu + */ + +#include /* for EINVAL */ +#include /* for strcmp */ +#include /* for ARRAY_SIZE */ +#include + +struct dwarf_regs_idx { + const char *name; + int idx; +}; + +static const struct dwarf_regs_idx x86_regidx_table[] = { + { "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 }, + { "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 }, + { "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 }, + { "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 }, + { "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 }, + { "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 }, + { "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 }, + { "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 }, + { "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 }, + { "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 }, + { "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 }, + { "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 }, + { "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 }, + { "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 }, + { "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 }, + { "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 }, + { "rip", DWARF_REG_PC }, +}; + +int get_x86_regnum(const char *name) +{ + unsigned int i; + + if (*name != '%') + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++) + if (!strcmp(x86_regidx_table[i].name, name + 1)) + return x86_regidx_table[i].idx; + return -ENOENT; +} diff --git a/tools/perf/util/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-csky.c deleted file mode 100644 index d38ef1f07f3e..000000000000 --- a/tools/perf/util/dwarf-regs-csky.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. -// Mapping of DWARF debug register numbers into register names. - -#include -#include - -#define CSKY_ABIV2_MAX_REGS 73 -const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { - /* r0 ~ r8 */ - "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3", - /* r9 ~ r15 */ - "%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp", - "%lr", - /* r16 ~ r23 */ - "%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4", - "%exregs5", "%exregs6", "%exregs7", - /* r24 ~ r31 */ - "%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12", - "%exregs13", "%exregs14", "%tls", - "%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "%epc", -}; - -#define CSKY_ABIV1_MAX_REGS 57 -const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = { - /* r0 ~ r8 */ - "%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", - /* r9 ~ r15 */ - "%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8", - "%lr", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "%epc", -}; - -const char *get_csky_regstr(unsigned int n, unsigned int flags) -{ - if (flags & EF_CSKY_ABIV2) - return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL; - - return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL; -} diff --git a/tools/perf/util/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-powerpc.c deleted file mode 100644 index caf77a234c78..000000000000 --- a/tools/perf/util/dwarf-regs-powerpc.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Mapping of DWARF debug register numbers into register names. - * - * Copyright (C) 2010 Ian Munsie, IBM Corporation. - */ - -#include - -#define PPC_OP(op) (((op) >> 26) & 0x3F) -#define PPC_RA(a) (((a) >> 16) & 0x1f) -#define PPC_RT(t) (((t) >> 21) & 0x1f) -#define PPC_RB(b) (((b) >> 11) & 0x1f) -#define PPC_D(D) ((D) & 0xfffe) -#define PPC_DS(DS) ((DS) & 0xfffc) -#define OP_LD 58 -#define OP_STD 62 - -static int get_source_reg(u32 raw_insn) -{ - return PPC_RA(raw_insn); -} - -static int get_target_reg(u32 raw_insn) -{ - return PPC_RT(raw_insn); -} - -static int get_offset_opcode(u32 raw_insn) -{ - int opcode = PPC_OP(raw_insn); - - /* DS- form */ - if ((opcode == OP_LD) || (opcode == OP_STD)) - return PPC_DS(raw_insn); - else - return PPC_D(raw_insn); -} - -/* - * Fills the required fields for op_loc depending on if it - * is a source or target. - * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT - * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT - * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT - */ -void get_powerpc_regs(u32 raw_insn, int is_source, - struct annotated_op_loc *op_loc) -{ - if (is_source) - op_loc->reg1 = get_source_reg(raw_insn); - else - op_loc->reg1 = get_target_reg(raw_insn); - - if (op_loc->multi_regs) - op_loc->reg2 = PPC_RB(raw_insn); - - /* TODO: Implement offset handling for X Form */ - if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31)) - op_loc->offset = get_offset_opcode(raw_insn); -} diff --git a/tools/perf/util/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-x86.c deleted file mode 100644 index 7a55c65e8da6..000000000000 --- a/tools/perf/util/dwarf-regs-x86.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * dwarf-regs.c : Mapping of DWARF debug register numbers into register names. - * Extracted from probe-finder.c - * - * Written by Masami Hiramatsu - */ - -#include /* for EINVAL */ -#include /* for strcmp */ -#include /* for ARRAY_SIZE */ -#include - -struct dwarf_regs_idx { - const char *name; - int idx; -}; - -static const struct dwarf_regs_idx x86_regidx_table[] = { - { "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 }, - { "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 }, - { "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 }, - { "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 }, - { "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 }, - { "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 }, - { "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 }, - { "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 }, - { "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 }, - { "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 }, - { "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 }, - { "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 }, - { "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 }, - { "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 }, - { "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 }, - { "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 }, - { "rip", DWARF_REG_PC }, -}; - -int get_x86_regnum(const char *name) -{ - unsigned int i; - - if (*name != '%') - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++) - if (!strcmp(x86_regidx_table[i].name, name + 1)) - return x86_regidx_table[i].idx; - return -ENOENT; -} -- cgit v1.2.3 From 3a00f41646bbcb45aff17bb4ba27c52c6bab4f68 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:36 -0800 Subject: perf dwarf-regs: Remove get_arch_regnum() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Except in dwarf-regs the function is never called. The weak function has no strong arch implementations. Remove so that the fall-through case applies. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs.c | 12 ------------ tools/perf/util/include/dwarf-regs.h | 4 ---- 2 files changed, 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 28a1cfdf26d4..b2f37299147e 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -71,13 +71,6 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int return NULL; } -#if EM_HOST != EM_X86_64 && EM_HOST != EM_386 -__weak int get_arch_regnum(const char *name __maybe_unused) -{ - return -ENOTSUP; -} -#endif - /* Return DWARF register number from architecture register name */ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags __maybe_unused) { @@ -98,11 +91,6 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags machine = EM_HOST; } switch (machine) { -#if EM_HOST != EM_X86_64 && EM_HOST != EM_386 - case EM_HOST: - reg = get_arch_regnum(regname); - break; -#endif case EM_X86_64: fallthrough; case EM_386: diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 6f1b9f6b2466..015d1ade645f 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -101,10 +101,6 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int int get_x86_regnum(const char *name); -#if !defined(__x86_64__) && !defined(__i386__) -int get_arch_regnum(const char *name); -#endif - /* * get_dwarf_regnum - Returns DWARF regnum from register name * name: architecture register name -- cgit v1.2.3 From c31040085914f1188720073baa43d1483693c0a3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:37 -0800 Subject: perf dwarf-regs: Clean up x86 dwarf_regnum code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The i386 and x86-64 register numbers differ on x86, but previously x86 was a single arch string and so this couldn't be handled. The transition to using ELF EM_* values means we can translate x86 registers correctly for either the x86-64 dwarf register mappings (from the System V ABI) or i386 register mappings. Correct the mappings. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c | 133 +++++++++++++++++++++-- tools/perf/util/dwarf-regs.c | 5 +- tools/perf/util/include/dwarf-regs.h | 3 +- 3 files changed, 129 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c index 7a55c65e8da6..f0c42e4d7423 100644 --- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c @@ -13,10 +13,65 @@ struct dwarf_regs_idx { const char *name; - int idx; + int dwarf_regnum; }; -static const struct dwarf_regs_idx x86_regidx_table[] = { +static const struct dwarf_regs_idx i386_regidx_table[] = { + { "eax", 0 }, { "ax", 0 }, { "al", 0 }, + { "ecx", 1 }, { "cx", 1 }, { "cl", 1 }, + { "edx", 2 }, { "dx", 2 }, { "dl", 2 }, + { "ebx", 3 }, { "bx", 3 }, { "bl", 3 }, + { "esp", 4 }, { "sp", 4 }, { "$stack", 4}, + { "ebp", 5 }, { "bp", 5 }, + { "esi", 6 }, { "si", 6 }, + { "edi", 7 }, { "di", 7 }, + // 8 - Return Address RA + { "eflags", 9}, { "flags", 9}, + // 10 - reserved + { "st0", 11}, + { "st1", 12}, + { "st2", 13}, + { "st3", 14}, + { "st4", 15}, + { "st5", 16}, + { "st6", 17}, + { "st7", 18}, + // 19-20 - reserved + { "xmm0", 21}, + { "xmm1", 22}, + { "xmm2", 23}, + { "xmm3", 24}, + { "xmm4", 25}, + { "xmm5", 26}, + { "xmm6", 27}, + { "xmm7", 28}, + { "mm0", 29}, + { "mm1", 30}, + { "mm2", 31}, + { "mm3", 32}, + { "mm4", 33}, + { "mm5", 34}, + { "mm6", 35}, + { "mm7", 36}, + // 37-38 - unknown + { "mxcsr", 39}, // 128-bit Media Control and Status + { "es", 40}, + { "cs", 41}, + { "ss", 42}, + { "ds", 43}, + { "fs", 44}, + { "gs", 45}, + // 46-47 - reserved + { "tr", 48}, // Task Register + { "ldtr", 49}, // LDT Register + // 50-92 - reserved + { "fs.base", 92}, + { "gs.base", 93}, + // End of regular dwarf registers. + { "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC }, +}; + +static const struct dwarf_regs_idx x86_64_regidx_table[] = { { "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 }, { "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 }, { "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 }, @@ -33,18 +88,78 @@ static const struct dwarf_regs_idx x86_regidx_table[] = { { "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 }, { "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 }, { "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 }, - { "rip", DWARF_REG_PC }, + // 16 - Return Address RA + { "xmm0", 17}, + { "xmm1", 18}, + { "xmm2", 19}, + { "xmm3", 20}, + { "xmm4", 21}, + { "xmm5", 22}, + { "xmm6", 23}, + { "xmm7", 24}, + { "xmm8", 25}, + { "xmm9", 26}, + { "xmm10", 27}, + { "xmm11", 28}, + { "xmm12", 29}, + { "xmm13", 30}, + { "xmm14", 31}, + { "xmm15", 32}, + { "st0", 33}, + { "st1", 34}, + { "st2", 35}, + { "st3", 36}, + { "st4", 37}, + { "st5", 38}, + { "st6", 39}, + { "st7", 40}, + { "mm0", 41}, + { "mm1", 42}, + { "mm2", 43}, + { "mm3", 44}, + { "mm4", 45}, + { "mm5", 46}, + { "mm6", 47}, + { "mm7", 48}, + { "rflags", 49}, { "eflags", 49}, { "flags", 49}, + { "es", 50}, + { "cs", 51}, + { "ss", 52}, + { "ds", 53}, + { "fs", 54}, + { "gs", 55}, + // 56-47 - reserved + { "fs.base", 58}, + { "gs.base", 59}, + // 60-61 - reserved + { "tr", 62}, // Task Register + { "ldtr", 63}, // LDT Register + { "mxcsr", 64}, // 128-bit Media Control and Status + { "fcw", 65}, // x87 Control Word + { "fsw", 66}, // x87 Status Word + // End of regular dwarf registers. + { "rip", DWARF_REG_PC }, { "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC }, }; -int get_x86_regnum(const char *name) +static int get_regnum(const struct dwarf_regs_idx *entries, size_t num_entries, const char *name) { - unsigned int i; - if (*name != '%') return -EINVAL; - for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++) - if (!strcmp(x86_regidx_table[i].name, name + 1)) - return x86_regidx_table[i].idx; + name++; + for (size_t i = 0; i < num_entries; i++) { + if (!strcmp(entries[i].name, name)) + return entries[i].dwarf_regnum; + } return -ENOENT; } + +int __get_dwarf_regnum_i386(const char *name) +{ + return get_regnum(i386_regidx_table, ARRAY_SIZE(i386_regidx_table), name); +} + +int __get_dwarf_regnum_x86_64(const char *name) +{ + return get_regnum(x86_64_regidx_table, ARRAY_SIZE(x86_64_regidx_table), name); +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index b2f37299147e..ef249dd589e3 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -92,9 +92,10 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags } switch (machine) { case EM_X86_64: - fallthrough; + reg = __get_dwarf_regnum_x86_64(name); + break; case EM_386: - reg = get_x86_regnum(regname); + reg = __get_dwarf_regnum_i386(name); break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 015d1ade645f..bb5413b0fee4 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -99,7 +99,8 @@ const char *get_csky_regstr(unsigned int n, unsigned int flags); */ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags); -int get_x86_regnum(const char *name); +int __get_dwarf_regnum_i386(const char *name); +int __get_dwarf_regnum_x86_64(const char *name); /* * get_dwarf_regnum - Returns DWARF regnum from register name -- cgit v1.2.3 From d3ab52c31efab9e8a29b8fc1ae4c09ab41e0cf84 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:38 -0800 Subject: perf dwarf-regs: Add get_dwarf_regnum_for_perf_regnum() and use for x86 unwinding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a utility to map a perf register number to a DWARF register number for a particular ELF machine type. Create a generic unwind-libdw initial register initialization routine that uses this function and thereby avoids arch specific initialization. The unwind-libdw code does: 1) compute the maximum DWARF register from the set of sampled user registers, 2) allocates a set of DWARF registers, 3) copies the sample registers into the appropriate DWARF registers. This generic solution is initially implemented for use with x86 as only get_dwarf_regnum_for_perf_regnum() support for x86 is currently present. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c | 95 ++++++++++++++++++++++ tools/perf/util/dwarf-regs.c | 55 +++++++++++++ tools/perf/util/include/dwarf-regs.h | 8 ++ tools/perf/util/unwind-libdw-arch/Build | 1 - .../perf/util/unwind-libdw-arch/unwind-libdw-x86.c | 54 ------------ tools/perf/util/unwind-libdw.c | 70 ++++++++++++++-- tools/perf/util/unwind-libdw.h | 2 +- 7 files changed, 222 insertions(+), 63 deletions(-) delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c index f0c42e4d7423..cadef120aeb4 100644 --- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c @@ -10,6 +10,7 @@ #include /* for strcmp */ #include /* for ARRAY_SIZE */ #include +#include "../../../arch/x86/include/uapi/asm/perf_regs.h" struct dwarf_regs_idx { const char *name; @@ -163,3 +164,97 @@ int __get_dwarf_regnum_x86_64(const char *name) { return get_regnum(x86_64_regidx_table, ARRAY_SIZE(x86_64_regidx_table), name); } + +int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum) +{ + static const int dwarf_i386_regnums[] = { + [PERF_REG_X86_AX] = 0, + [PERF_REG_X86_BX] = 3, + [PERF_REG_X86_CX] = 1, + [PERF_REG_X86_DX] = 2, + [PERF_REG_X86_SI] = 6, + [PERF_REG_X86_DI] = 7, + [PERF_REG_X86_BP] = 5, + [PERF_REG_X86_SP] = 4, + [PERF_REG_X86_IP] = 8, + [PERF_REG_X86_FLAGS] = 9, + [PERF_REG_X86_CS] = 41, + [PERF_REG_X86_SS] = 42, + [PERF_REG_X86_DS] = 43, + [PERF_REG_X86_ES] = 40, + [PERF_REG_X86_FS] = 44, + [PERF_REG_X86_GS] = 45, + [PERF_REG_X86_XMM0] = 21, + [PERF_REG_X86_XMM1] = 22, + [PERF_REG_X86_XMM2] = 23, + [PERF_REG_X86_XMM3] = 24, + [PERF_REG_X86_XMM4] = 25, + [PERF_REG_X86_XMM5] = 26, + [PERF_REG_X86_XMM6] = 27, + [PERF_REG_X86_XMM7] = 28, + }; + + if (perf_regnum == 0) + return 0; + + if (perf_regnum < 0 || perf_regnum > (int)ARRAY_SIZE(dwarf_i386_regnums) || + dwarf_i386_regnums[perf_regnum] == 0) + return -ENOENT; + + return dwarf_i386_regnums[perf_regnum]; +} + +int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum) +{ + static const int dwarf_x86_64_regnums[] = { + [PERF_REG_X86_AX] = 0, + [PERF_REG_X86_BX] = 3, + [PERF_REG_X86_CX] = 2, + [PERF_REG_X86_DX] = 1, + [PERF_REG_X86_SI] = 4, + [PERF_REG_X86_DI] = 5, + [PERF_REG_X86_BP] = 6, + [PERF_REG_X86_SP] = 7, + [PERF_REG_X86_IP] = 16, + [PERF_REG_X86_FLAGS] = 49, + [PERF_REG_X86_CS] = 51, + [PERF_REG_X86_SS] = 52, + [PERF_REG_X86_DS] = 53, + [PERF_REG_X86_ES] = 50, + [PERF_REG_X86_FS] = 54, + [PERF_REG_X86_GS] = 55, + [PERF_REG_X86_R8] = 8, + [PERF_REG_X86_R9] = 9, + [PERF_REG_X86_R10] = 10, + [PERF_REG_X86_R11] = 11, + [PERF_REG_X86_R12] = 12, + [PERF_REG_X86_R13] = 13, + [PERF_REG_X86_R14] = 14, + [PERF_REG_X86_R15] = 15, + [PERF_REG_X86_XMM0] = 17, + [PERF_REG_X86_XMM1] = 18, + [PERF_REG_X86_XMM2] = 19, + [PERF_REG_X86_XMM3] = 20, + [PERF_REG_X86_XMM4] = 21, + [PERF_REG_X86_XMM5] = 22, + [PERF_REG_X86_XMM6] = 23, + [PERF_REG_X86_XMM7] = 24, + [PERF_REG_X86_XMM8] = 25, + [PERF_REG_X86_XMM9] = 26, + [PERF_REG_X86_XMM10] = 27, + [PERF_REG_X86_XMM11] = 28, + [PERF_REG_X86_XMM12] = 29, + [PERF_REG_X86_XMM13] = 30, + [PERF_REG_X86_XMM14] = 31, + [PERF_REG_X86_XMM15] = 32, + }; + + if (perf_regnum == 0) + return 0; + + if (perf_regnum < 0 || perf_regnum > (int)ARRAY_SIZE(dwarf_x86_64_regnums) || + dwarf_x86_64_regnums[perf_regnum] == 0) + return -ENOENT; + + return dwarf_x86_64_regnums[perf_regnum]; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index ef249dd589e3..1f7d892612df 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -103,3 +103,58 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags free(regname); return reg; } + +static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __maybe_unused) +{ + switch (machine) { + case EM_X86_64: + return 17; + case EM_386: + return 9; + case EM_ARM: + return 16; + case EM_AARCH64: + return 97; + case EM_CSKY: + return 38; + case EM_S390: + return 32; + case EM_PPC: + case EM_PPC64: + return 145; + case EM_RISCV: + return 66; + case EM_SPARC: + case EM_SPARCV9: + return 103; + case EM_LOONGARCH: + return 74; + default: + return 0; + } +} + +int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, + unsigned int flags, bool only_libdw_supported) +{ + int reg; + + switch (machine) { + case EM_X86_64: + reg = __get_dwarf_regnum_for_perf_regnum_x86_64(perf_regnum); + break; + case EM_386: + reg = __get_dwarf_regnum_for_perf_regnum_i386(perf_regnum); + break; + default: + pr_err("ELF MACHINE %x is not supported.\n", machine); + return -ENOENT; + } + if (reg >= 0 && only_libdw_supported) { + int nregs = get_libdw_frame_nregs(machine, flags); + + if (reg >= nregs) + reg = -ENOENT; + } + return reg; +} diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index bb5413b0fee4..00881f1d45d6 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -101,6 +101,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int int __get_dwarf_regnum_i386(const char *name); int __get_dwarf_regnum_x86_64(const char *name); +int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name @@ -109,6 +111,12 @@ int __get_dwarf_regnum_x86_64(const char *name); */ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags); +/* + * get_dwarf_regnum - Returns DWARF regnum from perf register number. + */ +int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, unsigned int flags, + bool only_libdw_supported); + void get_powerpc_regs(u32 raw_insn, int is_source, struct annotated_op_loc *op_loc); #else /* HAVE_LIBDW_SUPPORT */ diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index ef17a83a7813..5b5682029953 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,4 +1,3 @@ -perf-util-y += unwind-libdw-x86.o perf-util-y += unwind-libdw-arm.o perf-util-y += unwind-libdw-arm64.o perf-util-y += unwind-libdw-csky.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c deleted file mode 100644 index dd27545a4a68..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "../arch/x86/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[17]; - unsigned nregs; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_X86_##r); \ - val; \ -}) - - if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) { - dwarf_regs[0] = REG(AX); - dwarf_regs[1] = REG(CX); - dwarf_regs[2] = REG(DX); - dwarf_regs[3] = REG(BX); - dwarf_regs[4] = REG(SP); - dwarf_regs[5] = REG(BP); - dwarf_regs[6] = REG(SI); - dwarf_regs[7] = REG(DI); - dwarf_regs[8] = REG(IP); - nregs = 9; - } else { - dwarf_regs[0] = REG(AX); - dwarf_regs[1] = REG(DX); - dwarf_regs[2] = REG(CX); - dwarf_regs[3] = REG(BX); - dwarf_regs[4] = REG(SI); - dwarf_regs[5] = REG(DI); - dwarf_regs[6] = REG(BP); - dwarf_regs[7] = REG(SP); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(IP); - nregs = 17; - } - - return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index c25190cdceb4..055dab921442 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -6,6 +6,7 @@ #include #include "debug.h" #include "dso.h" +#include #include "unwind.h" #include "unwind-libdw.h" #include "machine.h" @@ -225,6 +226,59 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * return true; } +static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); + Dwarf_Word *dwarf_regs; + int max_dwarf_reg = 0; + bool ret; + uint16_t e_machine = ui->e_machine; + int e_flags = 0; + uint64_t ip_perf_reg = perf_arch_reg_ip(e_machine); + Dwarf_Word val = 0; + + + /* + * For every possible perf register in the bitmap determine the dwarf + * register and use to compute the max. + */ + for (int perf_reg = 0; perf_reg < 64; perf_reg++) { + if (user_regs->mask & (1ULL << perf_reg)) { + int dwarf_reg = + get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine, + e_flags, + /*only_libdw_supported=*/true); + if (dwarf_reg > max_dwarf_reg) + max_dwarf_reg = dwarf_reg; + } + } + + dwarf_regs = calloc(max_dwarf_reg + 1, sizeof(*dwarf_regs)); + if (!dwarf_regs) + return false; + + for (int perf_reg = 0; perf_reg < 64; perf_reg++) { + if (user_regs->mask & (1ULL << perf_reg)) { + int dwarf_reg = + get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine, + e_flags, + /*only_libdw_supported=*/true); + if (dwarf_reg >= 0) { + val = 0; + if (perf_reg_value(&val, user_regs, perf_reg) == 0) + dwarf_regs[dwarf_reg] = val; + } + } + } + if (perf_reg_value(&val, user_regs, ip_perf_reg) == 0) + dwfl_thread_state_register_pc(thread, val); + + ret = dwfl_thread_state_registers(thread, 0, max_dwarf_reg + 1, dwarf_regs); + free(dwarf_regs); + return ret; +} + #define DEFINE_DWFL_THREAD_CALLBACKS(arch) \ static const Dwfl_Thread_Callbacks callbacks_##arch = { \ .next_thread = next_thread, \ @@ -232,7 +286,12 @@ static const Dwfl_Thread_Callbacks callbacks_##arch = { \ .set_initial_registers = libdw_set_initial_registers_##arch, \ } -DEFINE_DWFL_THREAD_CALLBACKS(x86); +static const Dwfl_Thread_Callbacks callbacks_generic = { + .next_thread = next_thread, + .memory_read = memory_read, + .set_initial_registers = libdw_set_initial_registers_generic, +}; + DEFINE_DWFL_THREAD_CALLBACKS(arm); DEFINE_DWFL_THREAD_CALLBACKS(arm64); DEFINE_DWFL_THREAD_CALLBACKS(csky); @@ -257,12 +316,8 @@ static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) return &callbacks_riscv; else if (!strcmp(arch, "s390")) return &callbacks_s390; - else if (!strcmp(arch, "x86")) - return &callbacks_x86; - pr_err("Fail to get thread callbacks for arch %s, returns NULL\n", - arch); - return NULL; + return &callbacks_generic; } static int @@ -301,6 +356,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, bool best_effort) { struct machine *machine = maps__machine(thread__maps(thread)); + uint16_t e_machine = thread__e_machine(thread, machine); struct unwind_info *ui, ui_buf = { .sample = data, .thread = thread, @@ -308,9 +364,9 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, .cb = cb, .arg = arg, .max_stack = max_stack, + .e_machine = e_machine, .best_effort = best_effort }; - uint16_t e_machine = thread__e_machine(thread, machine); const char *arch = perf_env__arch(machine->env); Dwarf_Word ip; int err = -EINVAL, i; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 574b29848cce..496e5898e7ef 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,7 +9,6 @@ struct machine; struct perf_sample; struct thread; -bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg); @@ -28,6 +27,7 @@ struct unwind_info { void *arg; int max_stack; int idx; + uint16_t e_machine; bool best_effort; struct unwind_entry entries[]; }; -- cgit v1.2.3 From cf7c7f12042b9e9dfb7e63c9c3180b6af1860b2b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:39 -0800 Subject: perf dwarf-regs: Add basic get_dwarf_regnum() for most architectures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic get_dwarf_regnum() implementation for most architectures by using the get_dwarf_regstr() tables and returning the index of the name within the table. Some minor name and constification clean up for csky. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c | 24 ++++++++-- tools/perf/util/dwarf-regs.c | 58 +++++++++++++++++++++-- tools/perf/util/include/dwarf-regs.h | 5 +- 3 files changed, 78 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c index d38ef1f07f3e..86394ed46397 100644 --- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c @@ -2,11 +2,12 @@ // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. // Mapping of DWARF debug register numbers into register names. +#include #include #include #define CSKY_ABIV2_MAX_REGS 73 -const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { +static const char * const csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { /* r0 ~ r8 */ "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3", /* r9 ~ r15 */ @@ -27,7 +28,7 @@ const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { }; #define CSKY_ABIV1_MAX_REGS 57 -const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = { +static const char * const csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = { /* r0 ~ r8 */ "%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", /* r9 ~ r15 */ @@ -41,10 +42,27 @@ const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = { "%epc", }; -const char *get_csky_regstr(unsigned int n, unsigned int flags) +const char *__get_csky_regstr(unsigned int n, unsigned int flags) { if (flags & EF_CSKY_ABIV2) return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL; return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL; } + +static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name) +{ + for (size_t i = 0; i < num_regstr; i++) { + if (regstr[i] && !strcmp(regstr[i], name)) + return i; + } + return -ENOENT; +} + +int __get_csky_regnum(const char *name, unsigned int flags) +{ + if (flags & EF_CSKY_ABIV2) + return __get_dwarf_regnum(csky_dwarf_regs_table_abiv2, CSKY_ABIV2_MAX_REGS, name); + + return __get_dwarf_regnum(csky_dwarf_regs_table_abiv1, CSKY_ABIV1_MAX_REGS, name); +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 1f7d892612df..dffa0c8bdd14 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -27,11 +27,11 @@ #include "../arch/mips/include/dwarf-regs-table.h" #include "../arch/loongarch/include/dwarf-regs-table.h" -#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL) - /* Return architecture dependent register string (for kprobe-tracer) */ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags) { + #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL) + if (machine == EM_NONE) { /* Generic arch - use host arch */ machine = EM_HOST; @@ -46,7 +46,7 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int case EM_AARCH64: return __get_dwarf_regstr(aarch64_regstr_tbl, n); case EM_CSKY: - return get_csky_regstr(n, flags); + return __get_csky_regstr(n, flags); case EM_SH: return __get_dwarf_regstr(sh_regstr_tbl, n); case EM_S390: @@ -69,15 +69,28 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int pr_err("ELF MACHINE %x is not supported.\n", machine); } return NULL; + + #undef __get_dwarf_regstr +} + +static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name) +{ + for (size_t i = 0; i < num_regstr; i++) { + if (regstr[i] && !strcmp(regstr[i], name)) + return i; + } + return -ENOENT; } /* Return DWARF register number from architecture register name */ -int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags __maybe_unused) +int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags) { char *regname = strdup(name); int reg = -1; char *p; + #define _get_dwarf_regnum(tbl, name) __get_dwarf_regnum(tbl, ARRAY_SIZE(tbl), name) + if (regname == NULL) return -EINVAL; @@ -97,11 +110,48 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags case EM_386: reg = __get_dwarf_regnum_i386(name); break; + case EM_ARM: + reg = _get_dwarf_regnum(arm_regstr_tbl, name); + break; + case EM_AARCH64: + reg = _get_dwarf_regnum(aarch64_regstr_tbl, name); + break; + case EM_CSKY: + reg = __get_csky_regnum(name, flags); + break; + case EM_SH: + reg = _get_dwarf_regnum(sh_regstr_tbl, name); + break; + case EM_S390: + reg = _get_dwarf_regnum(s390_regstr_tbl, name); + break; + case EM_PPC: + case EM_PPC64: + reg = _get_dwarf_regnum(powerpc_regstr_tbl, name); + break; + case EM_RISCV: + reg = _get_dwarf_regnum(riscv_regstr_tbl, name); + break; + case EM_SPARC: + case EM_SPARCV9: + reg = _get_dwarf_regnum(sparc_regstr_tbl, name); + break; + case EM_XTENSA: + reg = _get_dwarf_regnum(xtensa_regstr_tbl, name); + break; + case EM_MIPS: + reg = _get_dwarf_regnum(mips_regstr_tbl, name); + break; + case EM_LOONGARCH: + reg = _get_dwarf_regnum(loongarch_regstr_tbl, name); + break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); } free(regname); return reg; + + #undef _get_dwarf_regnum } static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __maybe_unused) diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 00881f1d45d6..a120c97a5fac 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -89,8 +89,6 @@ #define DWARF_REG_FB 0xd3affb /* random number */ #ifdef HAVE_LIBDW_SUPPORT -const char *get_csky_regstr(unsigned int n, unsigned int flags); - /** * get_dwarf_regstr() - Returns ftrace register string from DWARF regnum. * @n: DWARF register number. @@ -99,6 +97,9 @@ const char *get_csky_regstr(unsigned int n, unsigned int flags); */ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags); +const char *__get_csky_regstr(unsigned int n, unsigned int flags); +int __get_csky_regnum(const char *name, unsigned int flags); + int __get_dwarf_regnum_i386(const char *name); int __get_dwarf_regnum_x86_64(const char *name); int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum); -- cgit v1.2.3 From 8b863e70e2be6c256201d2297735a2a4bf1acf75 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:40 -0800 Subject: perf dwarf-regs: Add ARM perf to dwarf register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/Build | 2 + tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c | 12 +++++ tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c | 12 +++++ tools/perf/util/dwarf-regs.c | 6 +++ tools/perf/util/include/dwarf-regs.h | 3 ++ tools/perf/util/unwind-libdw-arch/Build | 2 - .../perf/util/unwind-libdw-arch/unwind-libdw-arm.c | 39 -------------- .../util/unwind-libdw-arch/unwind-libdw-arm64.c | 61 ---------------------- tools/perf/util/unwind-libdw.c | 8 +-- tools/perf/util/unwind-libdw.h | 2 - 10 files changed, 36 insertions(+), 111 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build index 98bec0032606..3f19a9ec47c7 100644 --- a/tools/perf/util/dwarf-regs-arch/Build +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -1,3 +1,5 @@ +perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c new file mode 100644 index 000000000000..42c6c0635612 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/arm/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum) +{ + if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM_MAX) + return -ENOENT; + + return perf_regnum; +} diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c new file mode 100644 index 000000000000..593ca7d4fccc --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/arm64/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum) +{ + if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM64_MAX) + return -ENOENT; + + return perf_regnum; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index dffa0c8bdd14..c472ec5e4d1a 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -196,6 +196,12 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_386: reg = __get_dwarf_regnum_for_perf_regnum_i386(perf_regnum); break; + case EM_ARM: + reg = __get_dwarf_regnum_for_perf_regnum_arm(perf_regnum); + break; + case EM_AARCH64: + reg = __get_dwarf_regnum_for_perf_regnum_arm64(perf_regnum); + break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); return -ENOENT; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index a120c97a5fac..a52df8d1b138 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -105,6 +105,9 @@ int __get_dwarf_regnum_x86_64(const char *name); int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum); + /* * get_dwarf_regnum - Returns DWARF regnum from register name * name: architecture register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index 5b5682029953..79c3bbdc2dee 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,5 +1,3 @@ -perf-util-y += unwind-libdw-arm.o -perf-util-y += unwind-libdw-arm64.o perf-util-y += unwind-libdw-csky.o perf-util-y += unwind-libdw-loongarch.o perf-util-y += unwind-libdw-powerpc.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c deleted file mode 100644 index 56e9b5975bcc..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "../arch/arm/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_ARM_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(FP); - dwarf_regs[12] = REG(IP); - dwarf_regs[13] = REG(SP); - dwarf_regs[14] = REG(LR); - dwarf_regs[15] = REG(PC); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX, - dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c deleted file mode 100644 index 29b6833e036c..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "../arch/arm64/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(X0); - dwarf_regs[1] = REG(X1); - dwarf_regs[2] = REG(X2); - dwarf_regs[3] = REG(X3); - dwarf_regs[4] = REG(X4); - dwarf_regs[5] = REG(X5); - dwarf_regs[6] = REG(X6); - dwarf_regs[7] = REG(X7); - dwarf_regs[8] = REG(X8); - dwarf_regs[9] = REG(X9); - dwarf_regs[10] = REG(X10); - dwarf_regs[11] = REG(X11); - dwarf_regs[12] = REG(X12); - dwarf_regs[13] = REG(X13); - dwarf_regs[14] = REG(X14); - dwarf_regs[15] = REG(X15); - dwarf_regs[16] = REG(X16); - dwarf_regs[17] = REG(X17); - dwarf_regs[18] = REG(X18); - dwarf_regs[19] = REG(X19); - dwarf_regs[20] = REG(X20); - dwarf_regs[21] = REG(X21); - dwarf_regs[22] = REG(X22); - dwarf_regs[23] = REG(X23); - dwarf_regs[24] = REG(X24); - dwarf_regs[25] = REG(X25); - dwarf_regs[26] = REG(X26); - dwarf_regs[27] = REG(X27); - dwarf_regs[28] = REG(X28); - dwarf_regs[29] = REG(X29); - dwarf_regs[30] = REG(LR); - dwarf_regs[31] = REG(SP); - - if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX, - dwarf_regs)) - return false; - - dwarf_pc = REG(PC); - dwfl_thread_state_register_pc(thread, dwarf_pc); - - return true; -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 055dab921442..8f291f9f9469 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -292,8 +292,6 @@ static const Dwfl_Thread_Callbacks callbacks_generic = { .set_initial_registers = libdw_set_initial_registers_generic, }; -DEFINE_DWFL_THREAD_CALLBACKS(arm); -DEFINE_DWFL_THREAD_CALLBACKS(arm64); DEFINE_DWFL_THREAD_CALLBACKS(csky); DEFINE_DWFL_THREAD_CALLBACKS(loongarch); DEFINE_DWFL_THREAD_CALLBACKS(powerpc); @@ -302,11 +300,7 @@ DEFINE_DWFL_THREAD_CALLBACKS(s390); static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) { - if (!strcmp(arch, "arm")) - return &callbacks_arm; - else if (!strcmp(arch, "arm64")) - return &callbacks_arm64; - else if (!strcmp(arch, "csky")) + if (!strcmp(arch, "csky")) return &callbacks_csky; else if (!strcmp(arch, "loongarch")) return &callbacks_loongarch; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 496e5898e7ef..fe3ae2a768ad 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,8 +9,6 @@ struct machine; struct perf_sample; struct thread; -bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg); -bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); -- cgit v1.2.3 From 8cac4013b0c23739ccbce19f74c1b572eba050d2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:41 -0800 Subject: perf dwarf-regs: Add csky perf to dwarf register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c | 58 ++++++++++++++++ tools/perf/util/dwarf-regs.c | 3 + tools/perf/util/include/dwarf-regs.h | 2 + tools/perf/util/unwind-libdw-arch/Build | 1 - .../util/unwind-libdw-arch/unwind-libdw-csky.c | 78 ---------------------- tools/perf/util/unwind-libdw.c | 5 +- tools/perf/util/unwind-libdw.h | 1 - 7 files changed, 64 insertions(+), 84 deletions(-) delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c index 86394ed46397..cb44b774f8d9 100644 --- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c @@ -5,6 +5,10 @@ #include #include #include +// Ensure the V2 perf reg definitions are included. +#undef __CSKYABIV2__ +#define __CSKYABIV2__ 1 +#include "../../../arch/csky/include/uapi/asm/perf_regs.h" #define CSKY_ABIV2_MAX_REGS 73 static const char * const csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = { @@ -66,3 +70,57 @@ int __get_csky_regnum(const char *name, unsigned int flags) return __get_dwarf_regnum(csky_dwarf_regs_table_abiv1, CSKY_ABIV1_MAX_REGS, name); } + +int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags) +{ + static const int dwarf_csky_regnums[][2] = { + [PERF_REG_CSKY_TLS] = {-ENOENT, 31}, + [PERF_REG_CSKY_LR] = {15, 15}, + [PERF_REG_CSKY_PC] = {-ENOENT, 32}, + /* TODO: PERF_REG_CSKY_SR */ + [PERF_REG_CSKY_SP] = {0, 14}, + /* TODO: PERF_REG_CSKY_ORIG_A0 */ + [PERF_REG_CSKY_A0] = {2, 0}, + [PERF_REG_CSKY_A1] = {3, 1}, + [PERF_REG_CSKY_A2] = {4, 2}, + [PERF_REG_CSKY_A3] = {5, 3}, + [PERF_REG_CSKY_REGS0] = {6, 4}, + [PERF_REG_CSKY_REGS1] = {7, 5}, + [PERF_REG_CSKY_REGS2] = {8, 6}, + [PERF_REG_CSKY_REGS3] = {9, 7}, + [PERF_REG_CSKY_REGS4] = {10, 8}, + [PERF_REG_CSKY_REGS5] = {11, 9}, + [PERF_REG_CSKY_REGS6] = {12, 10}, + [PERF_REG_CSKY_REGS7] = {13, 11}, + [PERF_REG_CSKY_REGS8] = {14, 12}, + [PERF_REG_CSKY_REGS9] = {1, 13}, + [PERF_REG_CSKY_EXREGS0] = {-ENOENT, 16}, + [PERF_REG_CSKY_EXREGS1] = {-ENOENT, 17}, + [PERF_REG_CSKY_EXREGS2] = {-ENOENT, 18}, + [PERF_REG_CSKY_EXREGS3] = {-ENOENT, 19}, + [PERF_REG_CSKY_EXREGS4] = {-ENOENT, 20}, + [PERF_REG_CSKY_EXREGS5] = {-ENOENT, 21}, + [PERF_REG_CSKY_EXREGS6] = {-ENOENT, 22}, + [PERF_REG_CSKY_EXREGS7] = {-ENOENT, 23}, + [PERF_REG_CSKY_EXREGS8] = {-ENOENT, 24}, + [PERF_REG_CSKY_EXREGS9] = {-ENOENT, 25}, + [PERF_REG_CSKY_EXREGS10] = {-ENOENT, 26}, + [PERF_REG_CSKY_EXREGS11] = {-ENOENT, 27}, + [PERF_REG_CSKY_EXREGS12] = {-ENOENT, 28}, + [PERF_REG_CSKY_EXREGS13] = {-ENOENT, 29}, + [PERF_REG_CSKY_EXREGS14] = {-ENOENT, 30}, + /* TODO: PERF_REG_CSKY_HI */ + /* TODO: PERF_REG_CSKY_LO */ + /* TODO: PERF_REG_CSKY_DCSR */ + }; + int idx = 0; + + if (flags & EF_CSKY_ABIV2) + idx++; + + if (perf_regnum < 0 || perf_regnum > (int)ARRAY_SIZE(dwarf_csky_regnums) || + dwarf_csky_regnums[perf_regnum][idx] == 0) + return -ENOENT; + + return dwarf_csky_regnums[perf_regnum][idx]; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index c472ec5e4d1a..7fa0930fd298 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -202,6 +202,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_AARCH64: reg = __get_dwarf_regnum_for_perf_regnum_arm64(perf_regnum); break; + case EM_CSKY: + reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags); + break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); return -ENOENT; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index a52df8d1b138..7780bc07e70e 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -108,6 +108,8 @@ int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags); + /* * get_dwarf_regnum - Returns DWARF regnum from register name * name: architecture register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index 79c3bbdc2dee..5fa1754fca8d 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,4 +1,3 @@ -perf-util-y += unwind-libdw-csky.o perf-util-y += unwind-libdw-loongarch.o perf-util-y += unwind-libdw-powerpc.o perf-util-y += unwind-libdw-riscv.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c deleted file mode 100644 index 2556d034c32a..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. - -#include -#include "../arch/csky/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r); \ - val; \ -}) - -#if defined(__CSKYABIV2__) - dwarf_regs[0] = REG(A0); - dwarf_regs[1] = REG(A1); - dwarf_regs[2] = REG(A2); - dwarf_regs[3] = REG(A3); - dwarf_regs[4] = REG(REGS0); - dwarf_regs[5] = REG(REGS1); - dwarf_regs[6] = REG(REGS2); - dwarf_regs[7] = REG(REGS3); - dwarf_regs[8] = REG(REGS4); - dwarf_regs[9] = REG(REGS5); - dwarf_regs[10] = REG(REGS6); - dwarf_regs[11] = REG(REGS7); - dwarf_regs[12] = REG(REGS8); - dwarf_regs[13] = REG(REGS9); - dwarf_regs[14] = REG(SP); - dwarf_regs[15] = REG(LR); - dwarf_regs[16] = REG(EXREGS0); - dwarf_regs[17] = REG(EXREGS1); - dwarf_regs[18] = REG(EXREGS2); - dwarf_regs[19] = REG(EXREGS3); - dwarf_regs[20] = REG(EXREGS4); - dwarf_regs[21] = REG(EXREGS5); - dwarf_regs[22] = REG(EXREGS6); - dwarf_regs[23] = REG(EXREGS7); - dwarf_regs[24] = REG(EXREGS8); - dwarf_regs[25] = REG(EXREGS9); - dwarf_regs[26] = REG(EXREGS10); - dwarf_regs[27] = REG(EXREGS11); - dwarf_regs[28] = REG(EXREGS12); - dwarf_regs[29] = REG(EXREGS13); - dwarf_regs[30] = REG(EXREGS14); - dwarf_regs[31] = REG(TLS); - dwarf_regs[32] = REG(PC); -#else - dwarf_regs[0] = REG(SP); - dwarf_regs[1] = REG(REGS9); - dwarf_regs[2] = REG(A0); - dwarf_regs[3] = REG(A1); - dwarf_regs[4] = REG(A2); - dwarf_regs[5] = REG(A3); - dwarf_regs[6] = REG(REGS0); - dwarf_regs[7] = REG(REGS1); - dwarf_regs[8] = REG(REGS2); - dwarf_regs[9] = REG(REGS3); - dwarf_regs[10] = REG(REGS4); - dwarf_regs[11] = REG(REGS5); - dwarf_regs[12] = REG(REGS6); - dwarf_regs[13] = REG(REGS7); - dwarf_regs[14] = REG(REGS8); - dwarf_regs[15] = REG(LR); -#endif - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX, - dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 8f291f9f9469..a193163da707 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -292,7 +292,6 @@ static const Dwfl_Thread_Callbacks callbacks_generic = { .set_initial_registers = libdw_set_initial_registers_generic, }; -DEFINE_DWFL_THREAD_CALLBACKS(csky); DEFINE_DWFL_THREAD_CALLBACKS(loongarch); DEFINE_DWFL_THREAD_CALLBACKS(powerpc); DEFINE_DWFL_THREAD_CALLBACKS(riscv); @@ -300,9 +299,7 @@ DEFINE_DWFL_THREAD_CALLBACKS(s390); static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) { - if (!strcmp(arch, "csky")) - return &callbacks_csky; - else if (!strcmp(arch, "loongarch")) + if (!strcmp(arch, "loongarch")) return &callbacks_loongarch; else if (!strcmp(arch, "powerpc")) return &callbacks_powerpc; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index fe3ae2a768ad..ee56f1e827e5 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,7 +9,6 @@ struct machine; struct perf_sample; struct thread; -bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg); -- cgit v1.2.3 From 1f10d82e6adffd45cf1a59618d1ecc33625a8a37 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:42 -0800 Subject: perf dwarf-regs: Add loongarch perf to DWARF register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/Build | 1 + .../util/dwarf-regs-arch/dwarf-regs-loongarch.c | 12 +++++ tools/perf/util/dwarf-regs.c | 3 ++ tools/perf/util/include/dwarf-regs.h | 1 + tools/perf/util/unwind-libdw-arch/Build | 1 - .../unwind-libdw-arch/unwind-libdw-loongarch.c | 57 ---------------------- tools/perf/util/unwind-libdw.c | 5 +- tools/perf/util/unwind-libdw.h | 1 - 8 files changed, 18 insertions(+), 63 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build index 3f19a9ec47c7..188359376ea5 100644 --- a/tools/perf/util/dwarf-regs-arch/Build +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -1,5 +1,6 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c new file mode 100644 index 000000000000..203077b740a0 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/loongarch/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum) +{ + if (perf_regnum < 0 || perf_regnum >= PERF_REG_LOONGARCH_MAX) + return -ENOENT; + + return perf_regnum; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 7fa0930fd298..033218f14b36 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -205,6 +205,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_CSKY: reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags); break; + case EM_LOONGARCH: + reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum); + break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); return -ENOENT; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 7780bc07e70e..bec15fb53e73 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -109,6 +109,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags); +int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index 5fa1754fca8d..62a4cbf2dca8 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,4 +1,3 @@ -perf-util-y += unwind-libdw-loongarch.o perf-util-y += unwind-libdw-powerpc.o perf-util-y += unwind-libdw-riscv.o perf-util-y += unwind-libdw-s390.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c deleted file mode 100644 index 5fca673508be..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ - -#include -#include "../arch/loongarch/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r); \ - val; \ -}) - - dwarf_regs[0] = 0; - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(R16); - dwarf_regs[17] = REG(R17); - dwarf_regs[18] = REG(R18); - dwarf_regs[19] = REG(R19); - dwarf_regs[20] = REG(R20); - dwarf_regs[21] = REG(R21); - dwarf_regs[22] = REG(R22); - dwarf_regs[23] = REG(R23); - dwarf_regs[24] = REG(R24); - dwarf_regs[25] = REG(R25); - dwarf_regs[26] = REG(R26); - dwarf_regs[27] = REG(R27); - dwarf_regs[28] = REG(R28); - dwarf_regs[29] = REG(R29); - dwarf_regs[30] = REG(R30); - dwarf_regs[31] = REG(R31); - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index a193163da707..9c8dad643cd0 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -292,16 +292,13 @@ static const Dwfl_Thread_Callbacks callbacks_generic = { .set_initial_registers = libdw_set_initial_registers_generic, }; -DEFINE_DWFL_THREAD_CALLBACKS(loongarch); DEFINE_DWFL_THREAD_CALLBACKS(powerpc); DEFINE_DWFL_THREAD_CALLBACKS(riscv); DEFINE_DWFL_THREAD_CALLBACKS(s390); static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) { - if (!strcmp(arch, "loongarch")) - return &callbacks_loongarch; - else if (!strcmp(arch, "powerpc")) + if (!strcmp(arch, "powerpc")) return &callbacks_powerpc; else if (!strcmp(arch, "riscv")) return &callbacks_riscv; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index ee56f1e827e5..9d177d70f15c 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,7 +9,6 @@ struct machine; struct perf_sample; struct thread; -bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg); -- cgit v1.2.3 From f005302294601a8fb770c71179a3a13951d125ad Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:43 -0800 Subject: perf dwarf-regs: Add powerpc perf to DWARF register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Note, the link register was being coped to DWARF register 65 that the SysV ABI spec claims is FPSCR. It is corrected here to 108, but this is unlikely to matter as FPSCR has little to no impact on unwinding. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c | 77 +++++++++++++++++++++- tools/perf/util/dwarf-regs.c | 4 ++ tools/perf/util/include/dwarf-regs.h | 1 + tools/perf/util/unwind-libdw-arch/Build | 1 - .../util/unwind-libdw-arch/unwind-libdw-powerpc.c | 76 --------------------- tools/perf/util/unwind-libdw.c | 5 +- tools/perf/util/unwind-libdw.h | 1 - 7 files changed, 82 insertions(+), 83 deletions(-) delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c index caf77a234c78..51892a09725b 100644 --- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c @@ -4,8 +4,9 @@ * * Copyright (C) 2010 Ian Munsie, IBM Corporation. */ - +#include #include +#include "../../../arch/powerpc/include/uapi/asm/perf_regs.h" #define PPC_OP(op) (((op) >> 26) & 0x3F) #define PPC_RA(a) (((a) >> 16) & 0x1f) @@ -59,3 +60,77 @@ void get_powerpc_regs(u32 raw_insn, int is_source, if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31)) op_loc->offset = get_offset_opcode(raw_insn); } + +int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum) +{ + static const int dwarf_powerpc_regnums[] = { + [PERF_REG_POWERPC_R0] = 0, + [PERF_REG_POWERPC_R1] = 1, + [PERF_REG_POWERPC_R2] = 2, + [PERF_REG_POWERPC_R3] = 3, + [PERF_REG_POWERPC_R4] = 4, + [PERF_REG_POWERPC_R5] = 5, + [PERF_REG_POWERPC_R6] = 6, + [PERF_REG_POWERPC_R7] = 7, + [PERF_REG_POWERPC_R8] = 8, + [PERF_REG_POWERPC_R9] = 9, + [PERF_REG_POWERPC_R10] = 10, + [PERF_REG_POWERPC_R11] = 11, + [PERF_REG_POWERPC_R12] = 12, + [PERF_REG_POWERPC_R13] = 13, + [PERF_REG_POWERPC_R14] = 14, + [PERF_REG_POWERPC_R15] = 15, + [PERF_REG_POWERPC_R16] = 16, + [PERF_REG_POWERPC_R17] = 17, + [PERF_REG_POWERPC_R18] = 18, + [PERF_REG_POWERPC_R19] = 19, + [PERF_REG_POWERPC_R20] = 20, + [PERF_REG_POWERPC_R21] = 21, + [PERF_REG_POWERPC_R22] = 22, + [PERF_REG_POWERPC_R23] = 23, + [PERF_REG_POWERPC_R24] = 24, + [PERF_REG_POWERPC_R25] = 25, + [PERF_REG_POWERPC_R26] = 26, + [PERF_REG_POWERPC_R27] = 27, + [PERF_REG_POWERPC_R28] = 28, + [PERF_REG_POWERPC_R29] = 29, + [PERF_REG_POWERPC_R30] = 30, + [PERF_REG_POWERPC_R31] = 31, + /* TODO: PERF_REG_POWERPC_NIP */ + [PERF_REG_POWERPC_MSR] = 66, + /* TODO: PERF_REG_POWERPC_ORIG_R3 */ + [PERF_REG_POWERPC_CTR] = 109, + [PERF_REG_POWERPC_LINK] = 108, /* Note, previously in perf encoded as 65? */ + [PERF_REG_POWERPC_XER] = 101, + /* TODO: PERF_REG_POWERPC_CCR */ + /* TODO: PERF_REG_POWERPC_SOFTE */ + /* TODO: PERF_REG_POWERPC_TRAP */ + /* TODO: PERF_REG_POWERPC_DAR */ + /* TODO: PERF_REG_POWERPC_DSISR */ + /* TODO: PERF_REG_POWERPC_SIER */ + /* TODO: PERF_REG_POWERPC_MMCRA */ + /* TODO: PERF_REG_POWERPC_MMCR0 */ + /* TODO: PERF_REG_POWERPC_MMCR1 */ + /* TODO: PERF_REG_POWERPC_MMCR2 */ + /* TODO: PERF_REG_POWERPC_MMCR3 */ + /* TODO: PERF_REG_POWERPC_SIER2 */ + /* TODO: PERF_REG_POWERPC_SIER3 */ + /* TODO: PERF_REG_POWERPC_PMC1 */ + /* TODO: PERF_REG_POWERPC_PMC2 */ + /* TODO: PERF_REG_POWERPC_PMC3 */ + /* TODO: PERF_REG_POWERPC_PMC4 */ + /* TODO: PERF_REG_POWERPC_PMC5 */ + /* TODO: PERF_REG_POWERPC_PMC6 */ + /* TODO: PERF_REG_POWERPC_SDAR */ + /* TODO: PERF_REG_POWERPC_SIAR */ + }; + + if (perf_regnum == 0) + return 0; + + if (perf_regnum < 0 || perf_regnum > (int)ARRAY_SIZE(dwarf_powerpc_regnums) || + dwarf_powerpc_regnums[perf_regnum] == 0) + return -ENOENT; + + return dwarf_powerpc_regnums[perf_regnum]; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 033218f14b36..3b1c2a436806 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -205,6 +205,10 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_CSKY: reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags); break; + case EM_PPC: + case EM_PPC64: + reg = __get_dwarf_regnum_for_perf_regnum_powerpc(perf_regnum); + break; case EM_LOONGARCH: reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum); break; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index bec15fb53e73..9ebb3ba33fba 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -110,6 +110,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags); int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index 62a4cbf2dca8..e6c97e842cd6 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,3 +1,2 @@ -perf-util-y += unwind-libdw-powerpc.o perf-util-y += unwind-libdw-riscv.o perf-util-y += unwind-libdw-s390.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c deleted file mode 100644 index 1560db45e7b4..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "../arch/powerpc/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils. */ -static const int special_regs[3][2] = { - { 65, PERF_REG_POWERPC_LINK }, - { 101, PERF_REG_POWERPC_XER }, - { 109, PERF_REG_POWERPC_CTR }, -}; - -bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[32], dwarf_nip; - size_t i; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r); \ - val; \ -}) - - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - dwarf_regs[16] = REG(R16); - dwarf_regs[17] = REG(R17); - dwarf_regs[18] = REG(R18); - dwarf_regs[19] = REG(R19); - dwarf_regs[20] = REG(R20); - dwarf_regs[21] = REG(R21); - dwarf_regs[22] = REG(R22); - dwarf_regs[23] = REG(R23); - dwarf_regs[24] = REG(R24); - dwarf_regs[25] = REG(R25); - dwarf_regs[26] = REG(R26); - dwarf_regs[27] = REG(R27); - dwarf_regs[28] = REG(R28); - dwarf_regs[29] = REG(R29); - dwarf_regs[30] = REG(R30); - dwarf_regs[31] = REG(R31); - if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs)) - return false; - - dwarf_nip = REG(NIP); - dwfl_thread_state_register_pc(thread, dwarf_nip); - for (i = 0; i < ARRAY_SIZE(special_regs); i++) { - Dwarf_Word val = 0; - perf_reg_value(&val, user_regs, special_regs[i][1]); - if (!dwfl_thread_state_registers(thread, - special_regs[i][0], 1, - &val)) - return false; - } - - return true; -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 9c8dad643cd0..e9ba050e7ab1 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -292,15 +292,12 @@ static const Dwfl_Thread_Callbacks callbacks_generic = { .set_initial_registers = libdw_set_initial_registers_generic, }; -DEFINE_DWFL_THREAD_CALLBACKS(powerpc); DEFINE_DWFL_THREAD_CALLBACKS(riscv); DEFINE_DWFL_THREAD_CALLBACKS(s390); static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) { - if (!strcmp(arch, "powerpc")) - return &callbacks_powerpc; - else if (!strcmp(arch, "riscv")) + if (!strcmp(arch, "riscv")) return &callbacks_riscv; else if (!strcmp(arch, "s390")) return &callbacks_s390; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 9d177d70f15c..0ec1abdabbe7 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -10,7 +10,6 @@ struct perf_sample; struct thread; bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); -bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg); -- cgit v1.2.3 From 36b372dfff51a0f069d4f8f11991b7241743fd52 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:44 -0800 Subject: perf dwarf-regs: Add RISC-V perf to DWARF register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/Build | 1 + tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c | 12 +++++ tools/perf/util/dwarf-regs.c | 3 ++ tools/perf/util/include/dwarf-regs.h | 1 + tools/perf/util/unwind-libdw-arch/Build | 1 - .../util/unwind-libdw-arch/unwind-libdw-riscv.c | 58 ---------------------- tools/perf/util/unwind-libdw.c | 5 +- tools/perf/util/unwind-libdw.h | 1 - 8 files changed, 18 insertions(+), 64 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build index 188359376ea5..94e4dfceb4d1 100644 --- a/tools/perf/util/dwarf-regs-arch/Build +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -3,4 +3,5 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c new file mode 100644 index 000000000000..090db51aba41 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/riscv/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum) +{ + if (perf_regnum < 0 || perf_regnum >= PERF_REG_RISCV_MAX) + return -ENOENT; + + return perf_regnum; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 3b1c2a436806..137568e15018 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -209,6 +209,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_PPC64: reg = __get_dwarf_regnum_for_perf_regnum_powerpc(perf_regnum); break; + case EM_RISCV: + reg = __get_dwarf_regnum_for_perf_regnum_riscv(perf_regnum); + break; case EM_LOONGARCH: reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum); break; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 9ebb3ba33fba..ae76608da110 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -111,6 +111,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags); int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build index e6c97e842cd6..6d6e319e1201 100644 --- a/tools/perf/util/unwind-libdw-arch/Build +++ b/tools/perf/util/unwind-libdw-arch/Build @@ -1,2 +1 @@ -perf-util-y += unwind-libdw-riscv.o perf-util-y += unwind-libdw-s390.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c deleted file mode 100644 index c2e2c4b6b2e0..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */ - -#include -#include "../arch/riscv/include/uapi/asm/perf_regs.h" -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/sample.h" - -bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[32]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r); \ - val; \ -}) - - dwarf_regs[0] = 0; - dwarf_regs[1] = REG(RA); - dwarf_regs[2] = REG(SP); - dwarf_regs[3] = REG(GP); - dwarf_regs[4] = REG(TP); - dwarf_regs[5] = REG(T0); - dwarf_regs[6] = REG(T1); - dwarf_regs[7] = REG(T2); - dwarf_regs[8] = REG(S0); - dwarf_regs[9] = REG(S1); - dwarf_regs[10] = REG(A0); - dwarf_regs[11] = REG(A1); - dwarf_regs[12] = REG(A2); - dwarf_regs[13] = REG(A3); - dwarf_regs[14] = REG(A4); - dwarf_regs[15] = REG(A5); - dwarf_regs[16] = REG(A6); - dwarf_regs[17] = REG(A7); - dwarf_regs[18] = REG(S2); - dwarf_regs[19] = REG(S3); - dwarf_regs[20] = REG(S4); - dwarf_regs[21] = REG(S5); - dwarf_regs[22] = REG(S6); - dwarf_regs[23] = REG(S7); - dwarf_regs[24] = REG(S8); - dwarf_regs[25] = REG(S9); - dwarf_regs[26] = REG(S10); - dwarf_regs[27] = REG(S11); - dwarf_regs[28] = REG(T3); - dwarf_regs[29] = REG(T4); - dwarf_regs[30] = REG(T5); - dwarf_regs[31] = REG(T6); - dwfl_thread_state_register_pc(thread, REG(PC)); - - return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX, - dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index e9ba050e7ab1..b3c4380d40b6 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -292,14 +292,11 @@ static const Dwfl_Thread_Callbacks callbacks_generic = { .set_initial_registers = libdw_set_initial_registers_generic, }; -DEFINE_DWFL_THREAD_CALLBACKS(riscv); DEFINE_DWFL_THREAD_CALLBACKS(s390); static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) { - if (!strcmp(arch, "riscv")) - return &callbacks_riscv; - else if (!strcmp(arch, "s390")) + if (!strcmp(arch, "s390")) return &callbacks_s390; return &callbacks_generic; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 0ec1abdabbe7..5c23080cb6c1 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -10,7 +10,6 @@ struct perf_sample; struct thread; bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); -bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg); bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg); struct unwind_info { -- cgit v1.2.3 From 1e452dd850f3d509cdc9da05f2c70161f7f73d37 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:45 -0800 Subject: perf dwarf-regs: Add S390 perf to DWARF register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions allow the generic initial register state code in unwind-libdw to be used. Now the non-generic code in unwind-libdw has no uses remove it. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/Build | 1 + tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c | 53 ++++++++++++++++++ tools/perf/util/dwarf-regs.c | 3 + tools/perf/util/include/dwarf-regs.h | 1 + tools/perf/util/unwind-libdw-arch/Build | 1 - .../util/unwind-libdw-arch/unwind-libdw-s390.c | 65 ---------------------- tools/perf/util/unwind-libdw.c | 31 ++--------- tools/perf/util/unwind-libdw.h | 1 - 8 files changed, 62 insertions(+), 94 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c delete mode 100644 tools/perf/util/unwind-libdw-arch/Build delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build index 94e4dfceb4d1..10c2af3d933a 100644 --- a/tools/perf/util/dwarf-regs-arch/Build +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -4,4 +4,5 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-s390.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c new file mode 100644 index 000000000000..310a37451bdc --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/s390/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum) +{ + static const int dwarf_s390_regnums[] = { + [PERF_REG_S390_R0] = 0, + [PERF_REG_S390_R1] = 1, + [PERF_REG_S390_R2] = 2, + [PERF_REG_S390_R3] = 3, + [PERF_REG_S390_R4] = 4, + [PERF_REG_S390_R5] = 5, + [PERF_REG_S390_R6] = 6, + [PERF_REG_S390_R7] = 7, + [PERF_REG_S390_R8] = 8, + [PERF_REG_S390_R9] = 9, + [PERF_REG_S390_R10] = 10, + [PERF_REG_S390_R11] = 11, + [PERF_REG_S390_R12] = 12, + [PERF_REG_S390_R13] = 13, + [PERF_REG_S390_R14] = 14, + [PERF_REG_S390_R15] = 15, + [PERF_REG_S390_FP0] = 16, + [PERF_REG_S390_FP1] = 20, + [PERF_REG_S390_FP2] = 17, + [PERF_REG_S390_FP3] = 21, + [PERF_REG_S390_FP4] = 18, + [PERF_REG_S390_FP5] = 22, + [PERF_REG_S390_FP6] = 19, + [PERF_REG_S390_FP7] = 23, + [PERF_REG_S390_FP8] = 24, + [PERF_REG_S390_FP9] = 28, + [PERF_REG_S390_FP10] = 25, + [PERF_REG_S390_FP11] = 29, + [PERF_REG_S390_FP12] = 26, + [PERF_REG_S390_FP13] = 30, + [PERF_REG_S390_FP14] = 27, + [PERF_REG_S390_FP15] = 31, + [PERF_REG_S390_MASK] = 64, + [PERF_REG_S390_PC] = 65, + }; + + if (perf_regnum == 0) + return 0; + + if (perf_regnum < 0 || perf_regnum > (int)ARRAY_SIZE(dwarf_s390_regnums) || + dwarf_s390_regnums[perf_regnum] == 0) + return -ENOENT; + + return dwarf_s390_regnums[perf_regnum]; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 137568e15018..f86f76547592 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -212,6 +212,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_RISCV: reg = __get_dwarf_regnum_for_perf_regnum_riscv(perf_regnum); break; + case EM_S390: + reg = __get_dwarf_regnum_for_perf_regnum_s390(perf_regnum); + break; case EM_LOONGARCH: reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum); break; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index ae76608da110..b95cf2d7b5b3 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -112,6 +112,7 @@ int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags) int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build deleted file mode 100644 index 6d6e319e1201..000000000000 --- a/tools/perf/util/unwind-libdw-arch/Build +++ /dev/null @@ -1 +0,0 @@ -perf-util-y += unwind-libdw-s390.o diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c deleted file mode 100644 index 1e05e9d9d95f..000000000000 --- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include "util/unwind-libdw.h" -#include "util/perf_regs.h" -#include "util/event.h" -#include "util/sample.h" -#include "../arch/s390/include/dwarf-regs-table.h" -#include "../arch/s390/include/uapi/asm/perf_regs.h" - - -bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg) -{ - struct unwind_info *ui = arg; - struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); - Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)]; - -#define REG(r) ({ \ - Dwarf_Word val = 0; \ - perf_reg_value(&val, user_regs, PERF_REG_S390_##r); \ - val; \ -}) - /* - * For DWARF register mapping details, - * see also perf/arch/s390/include/dwarf-regs-table.h - */ - dwarf_regs[0] = REG(R0); - dwarf_regs[1] = REG(R1); - dwarf_regs[2] = REG(R2); - dwarf_regs[3] = REG(R3); - dwarf_regs[4] = REG(R4); - dwarf_regs[5] = REG(R5); - dwarf_regs[6] = REG(R6); - dwarf_regs[7] = REG(R7); - dwarf_regs[8] = REG(R8); - dwarf_regs[9] = REG(R9); - dwarf_regs[10] = REG(R10); - dwarf_regs[11] = REG(R11); - dwarf_regs[12] = REG(R12); - dwarf_regs[13] = REG(R13); - dwarf_regs[14] = REG(R14); - dwarf_regs[15] = REG(R15); - - dwarf_regs[16] = REG(FP0); - dwarf_regs[17] = REG(FP2); - dwarf_regs[18] = REG(FP4); - dwarf_regs[19] = REG(FP6); - dwarf_regs[20] = REG(FP1); - dwarf_regs[21] = REG(FP3); - dwarf_regs[22] = REG(FP5); - dwarf_regs[23] = REG(FP7); - dwarf_regs[24] = REG(FP8); - dwarf_regs[25] = REG(FP10); - dwarf_regs[26] = REG(FP12); - dwarf_regs[27] = REG(FP14); - dwarf_regs[28] = REG(FP9); - dwarf_regs[29] = REG(FP11); - dwarf_regs[30] = REG(FP13); - dwarf_regs[31] = REG(FP15); - - dwarf_regs[64] = REG(MASK); - dwarf_regs[65] = REG(PC); - - dwfl_thread_state_register_pc(thread, dwarf_regs[65]); - return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs); -} diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index b3c4380d40b6..e0321043af88 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -226,7 +226,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * return true; } -static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg) +static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg) { struct unwind_info *ui = arg; struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); @@ -279,29 +279,12 @@ static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg) return ret; } -#define DEFINE_DWFL_THREAD_CALLBACKS(arch) \ -static const Dwfl_Thread_Callbacks callbacks_##arch = { \ - .next_thread = next_thread, \ - .memory_read = memory_read, \ - .set_initial_registers = libdw_set_initial_registers_##arch, \ -} - -static const Dwfl_Thread_Callbacks callbacks_generic = { +static const Dwfl_Thread_Callbacks callbacks = { .next_thread = next_thread, .memory_read = memory_read, - .set_initial_registers = libdw_set_initial_registers_generic, + .set_initial_registers = libdw_set_initial_registers, }; -DEFINE_DWFL_THREAD_CALLBACKS(s390); - -static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch) -{ - if (!strcmp(arch, "s390")) - return &callbacks_s390; - - return &callbacks_generic; -} - static int frame_callback(Dwfl_Frame *state, void *arg) { @@ -349,10 +332,8 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, .e_machine = e_machine, .best_effort = best_effort }; - const char *arch = perf_env__arch(machine->env); Dwarf_Word ip; int err = -EINVAL, i; - const Dwfl_Thread_Callbacks *callbacks; if (!data->user_regs || !data->user_regs->regs) return -EINVAL; @@ -375,11 +356,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (err) goto out; - callbacks = get_thread_callbacks(arch); - if (!callbacks) - goto out; - - err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), callbacks, ui); + err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, ui); if (err) goto out; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 5c23080cb6c1..20d63d881dff 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -10,7 +10,6 @@ struct perf_sample; struct thread; bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); -bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg); struct unwind_info { Dwfl *dwfl; -- cgit v1.2.3 From 406b51a9a5e8d3c3d862a6eebe3def7e11229693 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:46 -0800 Subject: perf dwarf-regs: Add MIPS perf to DWARF register number mapping functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite an unused function declaration, there was no unwind-libdw for MIPS but there is a perf_regs.h and a libdw implementation. Fill in the pieces so hopefully MIPS unwinding with libdw works. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-regs-arch/Build | 1 + tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c | 14 ++++++++++++++ tools/perf/util/dwarf-regs.c | 5 +++++ tools/perf/util/include/dwarf-regs.h | 1 + tools/perf/util/unwind-libdw.h | 2 -- 5 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c (limited to 'tools') diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build index 10c2af3d933a..ceb68ae86fd8 100644 --- a/tools/perf/util/dwarf-regs-arch/Build +++ b/tools/perf/util/dwarf-regs-arch/Build @@ -2,6 +2,7 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o +perf-util-$(CONFIG_LIBDW) += dwarf-regs-mips.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o perf-util-$(CONFIG_LIBDW) += dwarf-regs-s390.o diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c new file mode 100644 index 000000000000..3bb916b45c66 --- /dev/null +++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../../arch/mips/include/uapi/asm/perf_regs.h" + +int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum) +{ + if (perf_regnum == PERF_REG_MIPS_PC) + return 37; + if (perf_regnum < 0 || perf_regnum >= PERF_REG_MIPS_MAX) + return -ENOENT; + + return perf_regnum; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index f86f76547592..797f455eba0d 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -179,6 +179,8 @@ static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __mayb return 103; case EM_LOONGARCH: return 74; + case EM_MIPS: + return 71; default: return 0; } @@ -218,6 +220,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, case EM_LOONGARCH: reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum); break; + case EM_MIPS: + reg = __get_dwarf_regnum_for_perf_regnum_mips(perf_regnum); + break; default: pr_err("ELF MACHINE %x is not supported.\n", machine); return -ENOENT; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index b95cf2d7b5b3..46a764cf322f 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -113,6 +113,7 @@ int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum); int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum); +int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum); /* * get_dwarf_regnum - Returns DWARF regnum from register name diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 20d63d881dff..9c5b5fcaaae8 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -9,8 +9,6 @@ struct machine; struct perf_sample; struct thread; -bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg); - struct unwind_info { Dwfl *dwfl; struct perf_sample *sample; -- cgit v1.2.3 From 2e9191573a69ff962b018d85a2c58269a1637b27 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:47 -0800 Subject: perf build: Remove NO_LIBDW_DWARF_UNWIND option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Libdw unwinding support is present for every architecture that has a perf_regs.h - perf registers are needed for the initial frame to unwind. Elfutils also supports SPARC, ARC and m68k but there is no support in the Linux kernel for perf registers on these architectures. As the perf supported DWARF unwinding architectures are a subset of the elfutils ones, remove NO_LIBDW_DWARF_UNWIND as there isn't a case of elfutils lacking the support need for perf. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 19 +------------------ tools/perf/tests/make | 3 +-- tools/perf/util/Build | 3 +-- 3 files changed, 3 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 6f2c7bd36e74..5e4ae775987f 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -118,14 +118,6 @@ ifeq ($(ARCH),mips) endif endif -# So far there's only x86 and arm libdw unwind support merged in perf. -# Disable it on all other architectures in case libdw unwind -# support is detected in system. Add supported architectures -# to the check. -ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc s390 csky riscv loongarch)) - NO_LIBDW_DWARF_UNWIND := 1 -endif - ifneq ($(LIBUNWIND),1) NO_LIBUNWIND := 1 endif @@ -456,7 +448,6 @@ endif ifdef NO_LIBELF NO_LIBDW := 1 NO_LIBUNWIND := 1 - NO_LIBDW_DWARF_UNWIND := 1 NO_LIBBPF := 1 NO_JVMTI := 1 else @@ -504,10 +495,6 @@ ifeq ($(feature-libaio), 1) endif endif -ifdef NO_LIBDW - NO_LIBDW_DWARF_UNWIND := 1 -endif - ifeq ($(feature-scandirat), 1) # Ignore having scandirat with memory sanitizer that lacks an interceptor. ifeq ($(filter s% -fsanitize=memory%,$(EXTRA_CFLAGS),),) @@ -757,7 +744,7 @@ dwarf-post-unwind-text := BUG # setup DWARF post unwinder ifdef NO_LIBUNWIND - ifdef NO_LIBDW_DWARF_UNWIND + ifdef NO_LIBDW $(warning Disabling post unwind, no support found.) dwarf-post-unwind := 0 else @@ -767,10 +754,6 @@ ifdef NO_LIBUNWIND else dwarf-post-unwind-text := libunwind $(call detected,CONFIG_LIBUNWIND) - # Enable libunwind support by default. - ifndef NO_LIBDW_DWARF_UNWIND - NO_LIBDW_DWARF_UNWIND := 1 - endif endif ifeq ($(dwarf-post-unwind),1) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 36411b4b6d2b..767ad9e147a8 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -83,7 +83,6 @@ make_no_demangle := NO_DEMANGLE=1 make_no_libelf := NO_LIBELF=1 make_no_libdw := NO_LIBDW=1 make_libunwind := LIBUNWIND=1 -make_no_libdw_dwarf_unwind := NO_LIBDW_DWARF_UNWIND=1 make_no_backtrace := NO_BACKTRACE=1 make_no_libcapstone := NO_CAPSTONE=1 make_no_libnuma := NO_LIBNUMA=1 @@ -120,7 +119,7 @@ make_static := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX3 make_minimal := NO_LIBPYTHON=1 NO_GTK2=1 make_minimal += NO_DEMANGLE=1 NO_LIBELF=1 NO_BACKTRACE=1 make_minimal += NO_LIBNUMA=1 NO_LIBBIONIC=1 NO_LIBDW=1 -make_minimal += NO_LIBDW_DWARF_UNWIND=1 NO_LIBBPF=1 +make_minimal += NO_LIBBPF=1 make_minimal += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1 make_minimal += NO_LIBCAP=1 NO_CAPSTONE=1 diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 3cb1edd263cf..c30ff257f8b4 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -223,9 +223,8 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arch/ perf-util-$(CONFIG_LIBDW) += debuginfo.o perf-util-$(CONFIG_LIBDW) += annotate-data.o perf-util-$(CONFIG_LIBDW) += libdw.o +perf-util-$(CONFIG_LIBDW) += unwind-libdw.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw-arch/ perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind-local.o perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o perf-util-$(CONFIG_LIBUNWIND_X86) += libunwind/x86_32.o -- cgit v1.2.3 From 6b2658b3f36a7e524d7a8957e729e307484f2a8c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:48 -0800 Subject: perf unwind-libdw: Don't discard loaded ELF/DWARF after every unwind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unwind-libdw dwfl has ELF binaries associated with mmap addresses. Experimenting with using the per dso dwfl it is required to alter the address to be 0 based variant. Unfortunately libdwfl doesn't allow a single unwind and then an update to the return address to be 0 based as there are assertions that registers aren't updated once an unwind has started, etc. As removing the dwfl didn't prove possible, an alternative is to just not discard the dwfl when the unwind ends. The dwfl is valid for a process unless a dso is loaded at the same address as a previous one. So keep the dwfl with the maps, invalidate it if a map is removed (in case a new map replaces it) and recycle the dwfl in the unwinding code. A wrinkly in the implementation of this is that the attached thread argument is remembered by the dwfl and so it needs to be a pointer to memory that also persists with the dwfl (struct dwfl_ui_thread_info in the code). Recording 10 seconds of system wide data with --call-graph=dwarf and then processing with perf report shows a total runtime improvement from 41.583s to 2.279s (an 18x speedup). Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 36 ++++++++++++++++- tools/perf/util/maps.h | 4 ++ tools/perf/util/unwind-libdw.c | 90 +++++++++++++++++++++++++++++++----------- tools/perf/util/unwind-libdw.h | 9 ++++- 4 files changed, 112 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index c321d4f4d846..8ccc46d515b6 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -10,6 +10,7 @@ #include "thread.h" #include "ui/ui.h" #include "unwind.h" +#include "unwind-libdw.h" #include /* @@ -39,6 +40,9 @@ DECLARE_RC_STRUCT(maps) { #ifdef HAVE_LIBUNWIND_SUPPORT void *addr_space; const struct unwind_libunwind_ops *unwind_libunwind_ops; +#endif +#ifdef HAVE_LIBDW_SUPPORT + void *libdw_addr_space_dwfl; #endif refcount_t refcnt; /** @@ -203,6 +207,17 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops; } #endif +#ifdef HAVE_LIBDW_SUPPORT +void *maps__libdw_addr_space_dwfl(const struct maps *maps) +{ + return RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl; +} + +void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl) +{ + RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = dwfl; +} +#endif static struct rw_semaphore *maps__lock(struct maps *maps) { @@ -218,6 +233,9 @@ static void maps__init(struct maps *maps, struct machine *machine) #ifdef HAVE_LIBUNWIND_SUPPORT RC_CHK_ACCESS(maps)->addr_space = NULL; RC_CHK_ACCESS(maps)->unwind_libunwind_ops = NULL; +#endif +#ifdef HAVE_LIBDW_SUPPORT + RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = NULL; #endif refcount_set(maps__refcnt(maps), 1); RC_CHK_ACCESS(maps)->nr_maps = 0; @@ -240,6 +258,9 @@ static void maps__exit(struct maps *maps) zfree(&maps_by_address); zfree(&maps_by_name); unwind__finish_access(maps); +#ifdef HAVE_LIBDW_SUPPORT + libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps)); +#endif } struct maps *maps__new(struct machine *machine) @@ -549,6 +570,9 @@ void maps__remove(struct maps *maps, struct map *map) __maps__remove(maps, map); check_invariants(maps); up_write(maps__lock(maps)); +#ifdef HAVE_LIBDW_SUPPORT + libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps)); +#endif } bool maps__empty(struct maps *maps) @@ -604,18 +628,26 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data) void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data) { struct map **maps_by_address; + bool removed = false; down_write(maps__lock(maps)); maps_by_address = maps__maps_by_address(maps); for (unsigned int i = 0; i < maps__nr_maps(maps);) { - if (cb(maps_by_address[i], data)) + if (cb(maps_by_address[i], data)) { __maps__remove(maps, maps_by_address[i]); - else + removed = true; + } else { i++; + } } check_invariants(maps); up_write(maps__lock(maps)); + if (removed) { +#ifdef HAVE_LIBDW_SUPPORT + libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps)); +#endif + } } struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp) diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index d9aa62ed968a..20c52084ba9e 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -52,6 +52,10 @@ void maps__set_addr_space(struct maps *maps, void *addr_space); const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps); void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops); #endif +#ifdef HAVE_LIBDW_SUPPORT +void *maps__libdw_addr_space_dwfl(const struct maps *maps); +void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl); +#endif size_t maps__fprintf(struct maps *maps, FILE *fp); diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index e0321043af88..c1646ef5f971 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -20,6 +20,17 @@ #include "callchain.h" #include "util/env.h" +/* + * The dwfl thread argument passed to functions like memory_read. Memory has to + * be allocated to persist of multiple uses of the dwfl. + */ +struct dwfl_ui_thread_info { + /* Back link to the dwfl. */ + Dwfl *dwfl; + /* The current unwind info, only 1 is supported. */ + struct unwind_info *ui; +}; + static char *debuginfo_path; static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata, @@ -35,6 +46,19 @@ static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata, return -1; } +void libdw__invalidate_dwfl(struct maps *maps, void *arg) +{ + struct dwfl_ui_thread_info *dwfl_ui_ti = arg; + + if (!dwfl_ui_ti) + return; + + assert(dwfl_ui_ti->ui == NULL); + maps__set_libdw_addr_space_dwfl(maps, NULL); + dwfl_end(dwfl_ui_ti->dwfl); + free(dwfl_ui_ti); +} + static const Dwfl_Callbacks offline_callbacks = { .find_debuginfo = __find_debuginfo, .debuginfo_path = &debuginfo_path, @@ -187,7 +211,8 @@ out_fail: static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *result, void *arg) { - struct unwind_info *ui = arg; + struct dwfl_ui_thread_info *dwfl_ui_ti = arg; + struct unwind_info *ui = dwfl_ui_ti->ui; uint16_t e_machine = thread__e_machine(ui->thread, ui->machine); struct stack_dump *stack = &ui->sample->user_stack; u64 start, end; @@ -228,7 +253,8 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg) { - struct unwind_info *ui = arg; + struct dwfl_ui_thread_info *dwfl_ui_ti = arg; + struct unwind_info *ui = dwfl_ui_ti->ui; struct regs_dump *user_regs = perf_sample__user_regs(ui->sample); Dwarf_Word *dwarf_regs; int max_dwarf_reg = 0; @@ -320,33 +346,50 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, int max_stack, bool best_effort) { - struct machine *machine = maps__machine(thread__maps(thread)); + struct maps *maps = thread__maps(thread); + struct machine *machine = maps__machine(maps); uint16_t e_machine = thread__e_machine(thread, machine); - struct unwind_info *ui, ui_buf = { - .sample = data, - .thread = thread, - .machine = machine, - .cb = cb, - .arg = arg, - .max_stack = max_stack, - .e_machine = e_machine, - .best_effort = best_effort - }; + struct dwfl_ui_thread_info *dwfl_ui_ti; + static struct unwind_info *ui; + Dwfl *dwfl; Dwarf_Word ip; int err = -EINVAL, i; if (!data->user_regs || !data->user_regs->regs) return -EINVAL; - ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack); + ui = zalloc(sizeof(*ui) + sizeof(ui->entries[0]) * max_stack); if (!ui) return -ENOMEM; - *ui = ui_buf; + *ui = (struct unwind_info){ + .sample = data, + .thread = thread, + .machine = machine, + .cb = cb, + .arg = arg, + .max_stack = max_stack, + .e_machine = e_machine, + .best_effort = best_effort + }; - ui->dwfl = dwfl_begin(&offline_callbacks); - if (!ui->dwfl) - goto out; + dwfl_ui_ti = maps__libdw_addr_space_dwfl(maps); + if (dwfl_ui_ti) { + dwfl = dwfl_ui_ti->dwfl; + } else { + dwfl_ui_ti = zalloc(sizeof(*dwfl_ui_ti)); + dwfl = dwfl_begin(&offline_callbacks); + if (!dwfl) + goto out; + + dwfl_ui_ti->dwfl = dwfl; + maps__set_libdw_addr_space_dwfl(maps, dwfl_ui_ti); + } + assert(dwfl_ui_ti->ui == NULL); + assert(dwfl_ui_ti->dwfl == dwfl); + assert(dwfl_ui_ti == maps__libdw_addr_space_dwfl(maps)); + dwfl_ui_ti->ui = ui; + ui->dwfl = dwfl; err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine)); if (err) @@ -356,11 +399,12 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (err) goto out; - err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, ui); - if (err) - goto out; + dwfl_attach_state(dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, + /* Dwfl thread function argument*/dwfl_ui_ti); + // Ignore thread already attached error. - err = dwfl_getthread_frames(ui->dwfl, thread__tid(thread), frame_callback, ui); + err = dwfl_getthread_frames(dwfl, thread__tid(thread), frame_callback, + /* Dwfl frame function argument*/ui); if (err && ui->max_stack != max_stack) err = 0; @@ -384,7 +428,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, for (i = 0; i < ui->idx; i++) map_symbol__exit(&ui->entries[i].ms); - dwfl_end(ui->dwfl); + dwfl_ui_ti->ui = NULL; free(ui); return 0; } diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 9c5b5fcaaae8..3dec0ab8bd50 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -2,15 +2,17 @@ #ifndef __PERF_UNWIND_LIBDW_H #define __PERF_UNWIND_LIBDW_H -#include +#include #include "unwind.h" struct machine; struct perf_sample; struct thread; +#ifdef HAVE_LIBDW_SUPPORT + struct unwind_info { - Dwfl *dwfl; + void *dwfl; struct perf_sample *sample; struct machine *machine; struct thread *thread; @@ -23,4 +25,7 @@ struct unwind_info { struct unwind_entry entries[]; }; +void libdw__invalidate_dwfl(struct maps *maps, void *dwfl); +#endif + #endif /* __PERF_UNWIND_LIBDW_H */ -- cgit v1.2.3 From 28cb835f7645892f4559b92fcfeb25a81646f4cf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Jan 2026 21:28:49 -0800 Subject: perf machine: Add inline information to frame pointer and LBR callchains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use append_inlines() in frame pointer and LBR cases. Update the addr2line test to also test frame pointers. LBR is also updated but inaccuracy in the branched to IP means the inline information is missing in the leaf. Leave LBR callchains untested for now. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Mark Wielaard Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/addr2line_inlines.sh | 31 +++++++-- tools/perf/util/machine.c | 104 ++++++++++++++++------------ 2 files changed, 86 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh index 4a5b6f5be23d..ce30d9c7e0bf 100755 --- a/tools/perf/tests/shell/addr2line_inlines.sh +++ b/tools/perf/tests/shell/addr2line_inlines.sh @@ -21,8 +21,28 @@ trap_cleanup() { } trap trap_cleanup EXIT TERM INT -test_inlinedloop() { - echo "Inline unwinding verification test" +test_fp() { + echo "Inline unwinding fp verification test" + # Record data. Currently only dwarf callchains support inlined functions. + perf record --call-graph fp -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1 + + # Check output with inline (default) and srcline + perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}" + + # Expect the leaf and middle functions to occur on lines in the 20s, with + # the non-inlined parent function on a line in the 30s. + if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" && + grep -q "inlineloop.c:3.$" "${perf_script_txt}" + then + echo "Inline unwinding fp verification test [Success]" + else + echo "Inline unwinding fp verification test [Failed missing inlined functions]" + err=1 + fi +} + +test_dwarf() { + echo "Inline unwinding dwarf verification test" # Record data. Currently only dwarf callchains support inlined functions. perf record --call-graph dwarf -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1 @@ -34,14 +54,15 @@ test_inlinedloop() { if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" && grep -q "inlineloop.c:3.$" "${perf_script_txt}" then - echo "Inline unwinding verification test [Success]" + echo "Inline unwinding dwarf verification test [Success]" else - echo "Inline unwinding verification test [Failed missing inlined functions]" + echo "Inline unwinding dwarf verification test [Failed missing inlined functions]" err=1 fi } -test_inlinedloop +test_fp +test_dwarf cleanup exit $err diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 841b711d970e..30d606fbf040 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2090,6 +2090,59 @@ struct iterations { u64 cycles; }; +static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip, + bool branch, struct branch_flags *flags, int nr_loop_iter, + u64 iter_cycles, u64 branch_from) +{ + struct symbol *sym = ms->sym; + struct map *map = ms->map; + struct inline_node *inline_node; + struct inline_list *ilist; + struct dso *dso; + u64 addr; + int ret = 1; + struct map_symbol ilist_ms; + bool first = true; + + if (!symbol_conf.inline_name || !map || !sym) + return ret; + + addr = map__dso_map_ip(map, ip); + addr = map__rip_2objdump(map, addr); + dso = map__dso(map); + + inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr); + if (!inline_node) { + inline_node = dso__parse_addr_inlines(dso, addr, sym); + if (!inline_node) + return ret; + inlines__tree_insert(dso__inlined_nodes(dso), inline_node); + } + + ilist_ms = (struct map_symbol) { + .maps = maps__get(ms->maps), + .map = map__get(map), + }; + list_for_each_entry(ilist, &inline_node->val, list) { + ilist_ms.sym = ilist->symbol; + if (first) { + ret = callchain_cursor_append(cursor, ip, &ilist_ms, + branch, flags, nr_loop_iter, + iter_cycles, branch_from, ilist->srcline); + } else { + ret = callchain_cursor_append(cursor, ip, &ilist_ms, false, + NULL, 0, 0, 0, ilist->srcline); + } + first = false; + + if (ret != 0) + return ret; + } + map_symbol__exit(&ilist_ms); + + return ret; +} + static int add_callchain_ip(struct thread *thread, struct callchain_cursor *cursor, struct symbol **parent, @@ -2170,6 +2223,11 @@ static int add_callchain_ip(struct thread *thread, ms.maps = maps__get(al.maps); ms.map = map__get(al.map); ms.sym = al.sym; + + if (append_inlines(cursor, &ms, ip, branch, flags, nr_loop_iter, + iter_cycles, branch_from) == 0) + goto out; + srcline = callchain_srcline(&ms, al.addr); err = callchain_cursor_append(cursor, ip, &ms, branch, flags, nr_loop_iter, @@ -2888,49 +2946,6 @@ check_calls: return 0; } -static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip) -{ - struct symbol *sym = ms->sym; - struct map *map = ms->map; - struct inline_node *inline_node; - struct inline_list *ilist; - struct dso *dso; - u64 addr; - int ret = 1; - struct map_symbol ilist_ms; - - if (!symbol_conf.inline_name || !map || !sym) - return ret; - - addr = map__dso_map_ip(map, ip); - addr = map__rip_2objdump(map, addr); - dso = map__dso(map); - - inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr); - if (!inline_node) { - inline_node = dso__parse_addr_inlines(dso, addr, sym); - if (!inline_node) - return ret; - inlines__tree_insert(dso__inlined_nodes(dso), inline_node); - } - - ilist_ms = (struct map_symbol) { - .maps = maps__get(ms->maps), - .map = map__get(map), - }; - list_for_each_entry(ilist, &inline_node->val, list) { - ilist_ms.sym = ilist->symbol; - ret = callchain_cursor_append(cursor, ip, &ilist_ms, false, - NULL, 0, 0, 0, ilist->srcline); - - if (ret != 0) - return ret; - } - map_symbol__exit(&ilist_ms); - - return ret; -} - static int unwind_entry(struct unwind_entry *entry, void *arg) { struct callchain_cursor *cursor = arg; @@ -2940,7 +2955,8 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) if (symbol_conf.hide_unresolved && entry->ms.sym == NULL) return 0; - if (append_inlines(cursor, &entry->ms, entry->ip) == 0) + if (append_inlines(cursor, &entry->ms, entry->ip, /*branch=*/false, /*branch_flags=*/NULL, + /*nr_loop_iter=*/0, /*iter_cycles=*/0, /*branch_from=*/0) == 0) return 0; /* -- cgit v1.2.3 From 2e6690d4f7fc41c4fae7d0a4c0bf11f1973e5650 Mon Sep 17 00:00:00 2001 From: Gyutae Bae Date: Tue, 20 Jan 2026 18:07:16 +0900 Subject: selftests/bpf: Add perfbuf multi-producer benchmark Add a multi-producer benchmark for perfbuf to complement the existing ringbuf multi-producer test. Unlike ringbuf which uses a shared buffer and experiences contention, perfbuf uses per-CPU buffers so the test measures scaling behavior rather than contention. This allows developers to compare perfbuf vs ringbuf performance under multi-producer workloads when choosing between the two for their systems. Signed-off-by: Gyutae Bae Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260120090716.82927-1-gyutae.opensource@navercorp.com --- tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 83e05e837871..123b7feb6935 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,6 +49,11 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Perfbuf, multi-producer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "pb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 --rb-sample-rate 50 pb-libbpf)" +done + header "Ringbuf, multi-producer contention in overwrite mode, no consumer" for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" -- cgit v1.2.3 From 965ef09a26f3efe7dcdc4691860c1e85da353b77 Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Fri, 26 Dec 2025 08:54:32 +0100 Subject: tools: lib: thermal: Correct CFLAGS and LDFLAGS in pkg-config template There are two issues with the current pkg-config template. Firstly, the -lthermal linker flag is missing. Secondly, the libnl3 include directory compiler flag references "include" instead of "includedir", which leads to an unexpanded variable when pkg-config is called. Add the missing -lthermal flag and correct the libnl3 include directory. Signed-off-by: Romain Gantois Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20251226-libthermal-pkgconfig-v1-1-3406de5ca8ea@bootlin.com --- tools/lib/thermal/libthermal.pc.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/thermal/libthermal.pc.template b/tools/lib/thermal/libthermal.pc.template index ac24d0ab17f5..3b8a24d0a8b8 100644 --- a/tools/lib/thermal/libthermal.pc.template +++ b/tools/lib/thermal/libthermal.pc.template @@ -8,5 +8,5 @@ Name: libthermal Description: thermal library Requires: libnl-3.0 libnl-genl-3.0 Version: @VERSION@ -Libs: -L${libdir} -lnl-genl-3 -lnl-3 -Cflags: -I${includedir} -I${include}/libnl3 +Libs: -L${libdir} -lnl-genl-3 -lnl-3 -lthermal +Cflags: -I${includedir} -I${includedir}/libnl3 -- cgit v1.2.3 From 92ea788d2af4e65ad7a144ccfff50667e9a0d227 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 13 Jan 2026 15:29:02 -0800 Subject: perf inject: Add --convert-callchain option There are applications not built with frame pointers, so DWARF is needed to get the stack traces. `perf record --call-graph dwarf` saves the stack and register data for each sample to get the stacktrace offline. But sometimes this data may have sensitive information and we don't want to keep them in the file. This new 'perf inject --convert-callchain' option creates the callchains and discards the stack and register after that. This saves storage space and processing time for the new data file. Of course, users should remove the original data file to not keep sensitive data around. :) The down side is that it cannot handle inlined callchain entries as they all have the same IPs. Maybe we can add an option to 'perf report' to look up inlined functions using DWARF - IIUC it doesn't require stack and register data. This is an example. $ perf record --call-graph dwarf -- perf test -w noploop $ perf report --stdio --no-children --percent-limit=0 > output-prev $ perf inject -i perf.data --convert-callchain -o perf.data.out $ perf report --stdio --no-children --percent-limit=0 -i perf.data.out > output-next $ diff -u output-prev output-next ... 0.23% perf ld-linux-x86-64.so.2 [.] _dl_relocate_object_no_relro | - ---elf_dynamic_do_Rela (inlined) - _dl_relocate_object_no_relro + ---_dl_relocate_object_no_relro _dl_relocate_object dl_main _dl_sysdep_start - _dl_start_final (inlined) _dl_start _start Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-inject.txt | 5 + tools/perf/builtin-inject.c | 152 +++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index c972032f4ca0..95dfdf39666e 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -109,6 +109,11 @@ include::itrace.txt[] should be used, and also --buildid-all and --switch-events may be useful. +--convert-callchain:: + Parse DWARF callchains and convert them to usual callchains. This also + discards stack and register data from the samples. This will lose + inlined callchain entries. + :GMEXAMPLECMD: inject :GMEXAMPLESUBCMD: include::guestmount.txt[] diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 6080afec537d..e2a653280e1b 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -122,6 +122,7 @@ struct perf_inject { bool in_place_update; bool in_place_update_dry_run; bool copy_kcore_dir; + bool convert_callchain; const char *input_name; struct perf_data output; u64 bytes_written; @@ -133,6 +134,7 @@ struct perf_inject { struct guest_session guest_session; struct strlist *known_build_ids; const struct evsel *mmap_evsel; + struct ip_callchain *raw_callchain; }; struct event_entry { @@ -383,6 +385,90 @@ static int perf_event__repipe_sample(const struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } +static int perf_event__convert_sample_callchain(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct evsel *evsel, + struct machine *machine) +{ + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + union perf_event *event_copy = (void *)inject->event_copy; + struct callchain_cursor_node *node; + struct thread *thread; + u64 sample_type = evsel->core.attr.sample_type; + u32 sample_size = event->header.size; + u64 i, k; + int ret; + + if (event_copy == NULL) { + inject->event_copy = malloc(PERF_SAMPLE_MAX_SIZE); + if (!inject->event_copy) + return -ENOMEM; + + event_copy = (void *)inject->event_copy; + } + + if (cursor == NULL) + return -ENOMEM; + + callchain_cursor_reset(cursor); + + thread = machine__find_thread(machine, sample->tid, sample->pid); + if (thread == NULL) + goto out; + + /* this will parse DWARF using stack and register data */ + ret = thread__resolve_callchain(thread, cursor, evsel, sample, + /*parent=*/NULL, /*root_al=*/NULL, + PERF_MAX_STACK_DEPTH); + thread__put(thread); + if (ret != 0) + goto out; + + /* copy kernel callchain and context entries */ + for (i = 0; i < sample->callchain->nr; i++) { + inject->raw_callchain->ips[i] = sample->callchain->ips[i]; + if (sample->callchain->ips[i] == PERF_CONTEXT_USER) { + i++; + break; + } + } + if (i == 0 || inject->raw_callchain->ips[i - 1] != PERF_CONTEXT_USER) + inject->raw_callchain->ips[i++] = PERF_CONTEXT_USER; + + node = cursor->first; + for (k = 0; k < cursor->nr && i < PERF_MAX_STACK_DEPTH; k++) { + if (machine__kernel_ip(machine, node->ip)) + /* kernel IPs were added already */; + else if (node->ms.sym && node->ms.sym->inlined) + /* we can't handle inlined callchains */; + else + inject->raw_callchain->ips[i++] = node->ip; + + node = node->next; + } + + inject->raw_callchain->nr = i; + sample->callchain = inject->raw_callchain; + +out: + memcpy(event_copy, event, sizeof(event->header)); + + /* adjust sample size for stack and regs */ + sample_size -= sample->user_stack.size; + sample_size -= (hweight64(evsel->core.attr.sample_regs_user) + 1) * sizeof(u64); + sample_size += (sample->callchain->nr + 1) * sizeof(u64); + event_copy->header.size = sample_size; + + /* remove sample_type {STACK,REGS}_USER for synthesize */ + sample_type &= ~(PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER); + + perf_event__synthesize_sample(event_copy, sample_type, + evsel->core.attr.read_format, sample); + return perf_event__repipe_synth(tool, event_copy); +} + static struct dso *findnew_dso(int pid, int tid, const char *filename, const struct dso_id *id, struct machine *machine) { @@ -2270,6 +2356,15 @@ static int __cmd_inject(struct perf_inject *inject) /* Allow space in the header for guest attributes */ output_data_offset += gs->session->header.data_offset; output_data_offset = roundup(output_data_offset, 4096); + } else if (inject->convert_callchain) { + inject->tool.sample = perf_event__convert_sample_callchain; + inject->tool.fork = perf_event__repipe_fork; + inject->tool.comm = perf_event__repipe_comm; + inject->tool.exit = perf_event__repipe_exit; + inject->tool.mmap = perf_event__repipe_mmap; + inject->tool.mmap2 = perf_event__repipe_mmap2; + inject->tool.ordered_events = true; + inject->tool.ordering_requires_timestamps = true; } if (!inject->itrace_synth_opts.set) @@ -2322,6 +2417,23 @@ static int __cmd_inject(struct perf_inject *inject) perf_header__set_feat(&session->header, HEADER_BRANCH_STACK); } + + /* + * The converted data file won't have stack and registers. + * Update the perf_event_attr to remove them before writing. + */ + if (inject->convert_callchain) { + struct evsel *evsel; + + evlist__for_each_entry(session->evlist, evsel) { + evsel__reset_sample_bit(evsel, REGS_USER); + evsel__reset_sample_bit(evsel, STACK_USER); + evsel->core.attr.sample_regs_user = 0; + evsel->core.attr.sample_stack_user = 0; + evsel->core.attr.exclude_callchain_user = 0; + } + } + session->header.data_offset = output_data_offset; session->header.data_size = inject->bytes_written; perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc, @@ -2346,6 +2458,18 @@ static int __cmd_inject(struct perf_inject *inject) return ret; } +static bool evsel__has_dwarf_callchain(struct evsel *evsel) +{ + struct perf_event_attr *attr = &evsel->core.attr; + const u64 dwarf_callchain_flags = + PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN; + + if (!attr->exclude_callchain_user) + return false; + + return (attr->sample_type & dwarf_callchain_flags) == dwarf_callchain_flags; +} + int cmd_inject(int argc, const char **argv) { struct perf_inject inject = { @@ -2414,6 +2538,8 @@ int cmd_inject(int argc, const char **argv) OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", "guest mount directory under which every guest os" " instance has a subdir"), + OPT_BOOLEAN(0, "convert-callchain", &inject.convert_callchain, + "Generate callchains using DWARF and drop register/stack data"), OPT_END() }; const char * const inject_usage[] = { @@ -2429,6 +2555,9 @@ int cmd_inject(int argc, const char **argv) #ifndef HAVE_JITDUMP set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true); +#endif +#ifndef HAVE_LIBDW_SUPPORT + set_option_nobuild(options, 0, "convert-callchain", "NO_LIBDW=1", true); #endif argc = parse_options(argc, argv, options, inject_usage, 0); @@ -2588,6 +2717,28 @@ int cmd_inject(int argc, const char **argv) } } + if (inject.convert_callchain) { + struct evsel *evsel; + + if (inject.output.is_pipe || inject.session->data->is_pipe) { + pr_err("--convert-callchain cannot work with pipe\n"); + goto out_delete; + } + + evlist__for_each_entry(inject.session->evlist, evsel) { + if (!evsel__has_dwarf_callchain(evsel)) { + pr_err("--convert-callchain requires DWARF call graph.\n"); + goto out_delete; + } + } + + inject.raw_callchain = calloc(PERF_MAX_STACK_DEPTH, sizeof(u64)); + if (inject.raw_callchain == NULL) { + pr_err("callchain allocation failed\n"); + goto out_delete; + } + } + #ifdef HAVE_JITDUMP if (inject.jit_mode) { inject.tool.mmap2 = perf_event__repipe_mmap2; @@ -2618,5 +2769,6 @@ out_close_output: free(inject.itrace_synth_opts.vm_tm_corr_args); free(inject.event_copy); free(inject.guest_session.ev.event_buf); + free(inject.raw_callchain); return ret; } -- cgit v1.2.3 From b42c4dfe02af407d04375f070f56a818ed4c92ce Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 13 Jan 2026 15:29:03 -0800 Subject: perf test: Add DWARF callchain conversion test $ perf test -vv "DWARF callchain" 87: perf inject to convert DWARF callchains to regular ones: --- start --- test child forked, pid 1560328 recording data with DWARF callchain [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.908 MB /tmp/perf-test.nM3WoW (105 samples) ] convert DWARF callchain using perf inject compare the both result excluding inlined functions ---- end(0) ---- 87: perf inject to convert DWARF callchains to regular ones : Ok $ Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/inject-callchain.sh | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100755 tools/perf/tests/shell/inject-callchain.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/inject-callchain.sh b/tools/perf/tests/shell/inject-callchain.sh new file mode 100755 index 000000000000..a1cba8010f95 --- /dev/null +++ b/tools/perf/tests/shell/inject-callchain.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# perf inject to convert DWARF callchains to regular ones +# SPDX-License-Identifier: GPL-2.0 + +if ! perf check feature -q dwarf; then + echo "SKIP: DWARF support is not available" + exit 2 +fi + +TESTDATA=$(mktemp /tmp/perf-test.XXXXXX) + +err=0 + +cleanup() +{ + trap - EXIT TERM INT + rm -f ${TESTDATA}* +} + +trap_cleanup() +{ + cleanup + exit 1 +} + +trap trap_cleanup EXIT TERM INT + +echo "recording data with DWARF callchain" +perf record -F 999 --call-graph dwarf -o "${TESTDATA}" -- perf test -w noploop + +echo "convert DWARF callchain using perf inject" +perf inject -i "${TESTDATA}" --convert-callchain -o "${TESTDATA}.new" + +perf report -i "${TESTDATA}" --no-children -q --percent-limit=1 > ${TESTDATA}.out +perf report -i "${TESTDATA}.new" --no-children -q --percent-limit=1 > ${TESTDATA}.new.out + +echo "compare the both result excluding inlined functions" +if diff -u "${TESTDATA}.out" "${TESTDATA}.new.out" | grep "^- " | grep -qv "(inlined)"; then + echo "Found some differences" + diff -u "${TESTDATA}.out" "${TESTDATA}.new.out" + err=1 +fi + +cleanup +exit $err -- cgit v1.2.3 From 069e603d8248dac98b1ef2909e2f1c4169b9da11 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 13 Jan 2026 15:37:57 -0800 Subject: perf tools: Get debug info of DSO properly The dso__debuginfo() just used the path name to open the file but it may be outdated. It should check build-ID and use the file in the build-ID cache if available rather than just using the path name. Let's factor out dso__get_filename() to avoid code duplicate. Fixes: 53a61a6ca279165d ("perf annotate: Add dso__debuginfo() helper") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 63 +++++++++++++++++++++++++++++++++++++++------------ tools/perf/util/dso.h | 11 ++------- 2 files changed, 50 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index dce207c7f862..3b272a6fae24 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -112,7 +112,7 @@ bool dso__is_object_file(const struct dso *dso) int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type, - char *root_dir, char *filename, size_t size) + const char *root_dir, char *filename, size_t size) { char build_id_hex[SBUILD_ID_SIZE]; int ret = 0; @@ -561,20 +561,15 @@ char *dso__filename_with_chroot(const struct dso *dso, const char *filename) return filename_with_chroot(nsinfo__pid(dso__nsinfo_const(dso)), filename); } -static int __open_dso(struct dso *dso, struct machine *machine) - EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock) +static char *dso__get_filename(struct dso *dso, const char *root_dir, + bool *decomp) { - int fd = -EINVAL; - char *root_dir = (char *)""; char *name = malloc(PATH_MAX); - bool decomp = false; - if (!name) - return -ENOMEM; + *decomp = false; - mutex_lock(dso__lock(dso)); - if (machine) - root_dir = machine->root_dir; + if (name == NULL) + return NULL; if (dso__read_binary_type_filename(dso, dso__binary_type(dso), root_dir, name, PATH_MAX)) @@ -599,20 +594,38 @@ static int __open_dso(struct dso *dso, struct machine *machine) size_t len = sizeof(newpath); if (dso__decompress_kmodule_path(dso, name, newpath, len) < 0) { - fd = -(*dso__load_errno(dso)); + errno = *dso__load_errno(dso); goto out; } - decomp = true; + *decomp = true; strcpy(name, newpath); } + return name; + +out: + free(name); + return NULL; +} - fd = do_open(name); +static int __open_dso(struct dso *dso, struct machine *machine) + EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock) +{ + int fd = -EINVAL; + char *name; + bool decomp = false; + + mutex_lock(dso__lock(dso)); + + name = dso__get_filename(dso, machine ? machine->root_dir : "", &decomp); + if (name) + fd = do_open(name); + else + fd = -errno; if (decomp) unlink(name); -out: mutex_unlock(dso__lock(dso)); free(name); return fd; @@ -1916,3 +1929,23 @@ const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename, return __dso__read_symbol(dso, symfs_filename, start, len, out_buf, out_buf_len, is_64bit); } + +struct debuginfo *dso__debuginfo(struct dso *dso) +{ + char *name; + bool decomp = false; + struct debuginfo *dinfo = NULL; + + mutex_lock(dso__lock(dso)); + + name = dso__get_filename(dso, "", &decomp); + if (name) + dinfo = debuginfo__new(name); + + if (decomp) + unlink(name); + + mutex_unlock(dso__lock(dso)); + free(name); + return dinfo; +} diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 295388085031..ac725bc8ea74 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -784,7 +784,7 @@ int dso__kernel_module_get_build_id(struct dso *dso, const char *root_dir); char dso__symtab_origin(const struct dso *dso); int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type, - char *root_dir, char *filename, size_t size); + const char *root_dir, char *filename, size_t size); bool is_kernel_module(const char *pathname, int cpumode); bool dso__needs_decompress(struct dso *dso); int dso__decompress_kmodule_fd(struct dso *dso, const char *name); @@ -933,14 +933,7 @@ u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset); bool perf_pid_map_tid(const char *dso_name, int *tid); bool is_perf_pid_map_name(const char *dso_name); -/* - * In the future, we may get debuginfo using build-ID (w/o path). - * Add this helper is for the smooth conversion. - */ -static inline struct debuginfo *dso__debuginfo(struct dso *dso) -{ - return debuginfo__new(dso__long_name(dso)); -} +struct debuginfo *dso__debuginfo(struct dso *dso); const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename, const struct map *map, const struct symbol *sym, -- cgit v1.2.3 From 4906eccbfae3eec58f7bf27f2921dc365bedafec Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Mon, 8 Dec 2025 10:15:14 +0800 Subject: perf tools: Add the legacy-cache.json to .gitignore The commit 0012e0fa221bf9cc ("perf jevents: Add legacy-hardware and legacy-cache json") will auto-generate: "pmu-events/arch/common/common/legacy-cache.json". Reviewed-by: James Clark Signed-off-by: Haiyue Wang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index b64302a76144..5c59f954f52a 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -36,6 +36,7 @@ config.mak.autogen util/intel-pt-decoder/inat-tables.c arch/*/include/generated/ trace/beauty/generated/ +pmu-events/arch/common/common/legacy-cache.json pmu-events/pmu-events.c pmu-events/jevents pmu-events/metric_test.log -- cgit v1.2.3 From a58807adbed5f532efb231e5490767f284f237c0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 10 Dec 2025 11:01:41 -0800 Subject: perf tests kallsyms: Fix missed map__put() Issue was caught by leak sanitizer and the test robot. Fixes: 34e271ae55382fbd ("perf test: Add kallsyms split test") Reported-by: kernel test robot Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Closes: https://lore.kernel.org/oe-lkp/202512101502.f3819cd3-lkp@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/kallsyms-split.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/kallsyms-split.c b/tools/perf/tests/kallsyms-split.c index bbbc66957e5d..117ed3b70f63 100644 --- a/tools/perf/tests/kallsyms-split.c +++ b/tools/perf/tests/kallsyms-split.c @@ -148,6 +148,7 @@ static int test__kallsyms_split(struct test_suite *test __maybe_unused, ret = TEST_OK; out: + map__put(map); remove_proc_dir(0); machine__exit(&m); return ret; -- cgit v1.2.3 From e524dda49340cb973c95fdfd6aa700eeb67aa128 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 12 Dec 2025 17:24:59 +0000 Subject: perf mem: Simplify Arm SPE event config Since configuration fields default to zero, the zero assignments are redundant, remove them. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Al Grant Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/mem-events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c index 9f8da7937255..eaf00e0609c6 100644 --- a/tools/perf/arch/arm64/util/mem-events.c +++ b/tools/perf/arch/arm64/util/mem-events.c @@ -6,7 +6,7 @@ #define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a } struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = { - E("spe-load", "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/", NULL, true, 0), - E("spe-store", "%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/", NULL, false, 0), + E("spe-load", "%s/ts_enable=1,pa_enable=1,load_filter=1,min_latency=%u/", NULL, true, 0), + E("spe-store", "%s/ts_enable=1,pa_enable=1,store_filter=1/", NULL, false, 0), E("spe-ldst", "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/", NULL, true, 0), }; -- cgit v1.2.3 From dc7fb075f7de33ee78a2133598215af6a87d7ab3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 12 Dec 2025 17:25:00 +0000 Subject: perf c2c: Update documentation for adding memory event table Users may occasionally need to see which options are applied to memory events. This helps to understand the behavior of "perf c2c" and "perf mem", and provides guidance for configuring memory event options directly. Add a table to track memory events and their corresponding options, and include the Arm SPE events in it. Suggested-by: Al Grant Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-c2c.txt | 51 +++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index 40b0f71a2c44..e57a122b8719 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -160,20 +160,43 @@ Following perf record options are configured by default: -W,-d,--phys-data,--sample-cpu -Unless specified otherwise with '-e' option, following events are monitored by -default on Intel: - - cpu/mem-loads,ldlat=30/P - cpu/mem-stores/P - -following on AMD: - - ibs_op// - -and following on PowerPC: - - cpu/mem-loads/ - cpu/mem-stores/ +The following table lists the events monitored on different architectures. +Unless specified otherwise with the -e option, the tool will select the +default events. + + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | Arch | Configuration | Options | Events | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | Intel | Default | -e ldlat-loads | cpu/mem-loads,ldlat=30/P | + | | | -e ldlat-stores | cpu/mem-stores/P | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Load only | -e ldlat-loads | cpu/mem-loads,ldlat=30/P | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Store only | -e ldlat-stores | cpu/mem-stores/P | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | Intel | Default | -e ldlat-loads | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P | + | with | | -e ldlat-stores | cpu/mem-stores/P | + | AUX |--------------+------------------+--------------------------------------------------------------------------------+ + | | Load only | -e ldlat-loads | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Store only | -e ldlat-stores | cpu/mem-stores/P | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | AMD | Default | -e mem-ldst | ibs_op// (without latency support) | + | | | | ibs_op/ldlat=30/ (with latency support) | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | PowerPC| Default | -e ldlat-loads | cpu/mem-loads/ | + | | | -e ldlat-stores | cpu/mem-stores/ | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Load only | -e ldlat-loads | cpu/mem-loads/ | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Store only | -e ldlat-stores | cpu/mem-stores/ | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ + | Arm | Default | -e spe-ldst | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=30/ | + | SPE |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Load only | -e spe-load | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,min_latency=30/ | + | |---------------+-----------------+--------------------------------------------------------------------------------+ + | | Store only | -e spe-store | arm_spe_0/ts_enable=1,pa_enable=1,store_filter=1/ | + +--------+---------------+-----------------+--------------------------------------------------------------------------------+ User can pass any 'perf record' option behind '--' mark, like (to enable callchains and system wide monitoring): -- cgit v1.2.3 From 9bb93278c35d658057f513ba25ffb2b5204c2b9e Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 4 Dec 2025 09:10:53 +0000 Subject: perf tools: Always uniquify event names evlist__uniquify_evsel_names() only gets called in __parse_events() if verbose is > 0. This means that the auto added "slots" events stay as "slots" rather than being expanded to "cpu_core/slots/" unless Perf is run in verbose mode. This is invisible to users when running Perf stat because evlist__print_counters() always calls it regardless of verbose mode before displaying. The only thing this seems to affect is the test "Parsing of all PMU events from sysfs" which fails when not run in verbose mode. test__checkevent_pmu_events() always expects event names to be prefixed with the pmu name, but this only happens for "slots" events after evlist__uniquify_evsel_names() is called. One fix could be to relax the test to accept the non prefixed name in normal mode. But seeing as Perf stat uniquifies unconditionally, make parse_events() do the same. This fixes the following test failure: $ perf test "Parsing of all PMU events from sysfs" 5.2: Parsing of all PMU events from sysfs : FAILED! $ Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c8f2962a06c7..6e1185d7be1b 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2247,12 +2247,12 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte evlist__splice_list_tail(evlist, &parse_state.list); if (ret2 && warn_if_reordered && !parse_state.wild_card_pmus) { + evlist__uniquify_evsel_names(evlist, &stat_config); pr_warning("WARNING: events were regrouped to match PMUs\n"); if (verbose > 0) { struct strbuf sb = STRBUF_INIT; - evlist__uniquify_evsel_names(evlist, &stat_config); evlist__format_evsels(evlist, &sb, 2048); pr_debug("evlist after sorting/fixing: '%s'\n", sb.buf); strbuf_release(&sb); -- cgit v1.2.3 From 838def24130540dcb7b846bdb2bad63ea4c3dd55 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 4 Dec 2025 09:10:54 +0000 Subject: perf test: Add missing newlines in debug messages These debug messages bleed into the next log line. Fix it by adding the missing newlines. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 2bd622972114..1d3cc224fbc2 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2627,7 +2627,7 @@ static int test_events(const struct evlist_test *events, int cnt) pr_debug("running test %d '%s'\n", i, e.name); test_ret = test_event(&e); if (test_ret != TEST_OK) { - pr_debug("Event test failure: test %d '%s'", i, e.name); + pr_debug("Event test failure: test %d '%s'\n", i, e.name); ret = combine_test_results(ret, test_ret); } } @@ -2764,7 +2764,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest test_ret = test_event(&e); if (test_ret != TEST_OK) { - pr_debug("Test PMU event failed for '%s'", name); + pr_debug("Test PMU event failed for '%s'\n", name); ret = combine_test_results(ret, test_ret); } @@ -2790,7 +2790,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest e.check = test__checkevent_pmu_events_mix; test_ret = test_event(&e); if (test_ret != TEST_OK) { - pr_debug("Test PMU event failed for '%s'", name); + pr_debug("Test PMU event failed for '%s'\n", name); ret = combine_test_results(ret, test_ret); } } -- cgit v1.2.3 From a70493e2bb0878885aa7a8178162550270693eb1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 19 Jan 2026 10:18:35 +0000 Subject: perf cs-etm: Fix decoding for sparse CPU maps The ETM decoder incorrectly assumed that auxtrace queue indices were equivalent to CPU number. This assumption is used for inserting records into the queue, and for fetching queues when given a CPU number. This assumption held when Perf always opened a dummy event on every CPU, even if the user provided a subset of CPUs on the commandline, resulting in the indices aligning. For example: # event : name = cs_etm//u, , id = { 2451, 2452 }, type = 11 (cs_etm), size = 136, config = 0x4010, { sample_period, samp> # event : name = dummy:u, , id = { 2453, 2454, 2455, 2456 }, type = 1 (PERF_TYPE_SOFTWARE), size = 136, config = 0x9 (PER> 0 0 0x200 [0xd0]: PERF_RECORD_ID_INDEX nr: 6 ... id: 2451 idx: 2 cpu: 2 tid: -1 ... id: 2452 idx: 3 cpu: 3 tid: -1 ... id: 2453 idx: 0 cpu: 0 tid: -1 ... id: 2454 idx: 1 cpu: 1 tid: -1 ... id: 2455 idx: 2 cpu: 2 tid: -1 ... id: 2456 idx: 3 cpu: 3 tid: -1 Since commit 811082e4b668 ("perf parse-events: Support user CPUs mixed with threads/processes") the dummy event no longer behaves in this way, making the ETM event indices start from 0 on the first CPU recorded regardless of its ID: # event : name = cs_etm//u, , id = { 771, 772 }, type = 11 (cs_etm), size = 144, config = 0x4010, { sample_period, sample> # event : name = dummy:u, , id = { 773, 774 }, type = 1 (PERF_TYPE_SOFTWARE), size = 144, config = 0x9 (PERF_COUNT_SW_DUM> 0 0 0x200 [0x90]: PERF_RECORD_ID_INDEX nr: 4 ... id: 771 idx: 0 cpu: 2 tid: -1 ... id: 772 idx: 1 cpu: 3 tid: -1 ... id: 773 idx: 0 cpu: 2 tid: -1 ... id: 774 idx: 1 cpu: 3 tid: -1 This causes the following segfault when decoding: $ perf record -e cs_etm//u -C 2,3 -- true $ perf report perf: Segmentation fault -------- backtrace -------- #0 0xaaaabf9fd020 in ui__signal_backtrace setup.c:110 #1 0xffffab5c7930 in __kernel_rt_sigreturn [vdso][930] #2 0xaaaabfb68d30 in cs_etm_decoder__reset cs-etm-decoder.c:85 #3 0xaaaabfb65930 in cs_etm__get_data_block cs-etm.c:2032 #4 0xaaaabfb666fc in cs_etm__run_per_cpu_timeless_decoder cs-etm.c:2551 #5 0xaaaabfb6692c in (cs_etm__process_timeless_queues cs-etm.c:2612 #6 0xaaaabfb63390 in cs_etm__flush_events cs-etm.c:921 #7 0xaaaabfb324c0 in auxtrace__flush_events auxtrace.c:2915 #8 0xaaaabfaac378 in __perf_session__process_events session.c:2285 #9 0xaaaabfaacc9c in perf_session__process_events session.c:2442 #10 0xaaaabf8d3d90 in __cmd_report builtin-report.c:1085 #11 0xaaaabf8d6944 in cmd_report builtin-report.c:1866 #12 0xaaaabf95ebfc in run_builtin perf.c:351 #13 0xaaaabf95eeb0 in handle_internal_command perf.c:404 #14 0xaaaabf95f068 in run_argv perf.c:451 #15 0xaaaabf95f390 in main perf.c:558 #16 0xffffaab97400 in __libc_start_call_main libc_start_call_main.h:74 #17 0xffffaab974d8 in __libc_start_main@@GLIBC_2.34 libc-start.c:128 #18 0xaaaabf8aa8f0 in _start perf[7a8f0] Fix it by inserting into the queues based on CPU number, rather than using the index. Fixes: 811082e4b668db96 ("perf parse-events: Support user CPUs mixed with threads/processes") Signed-off-by: James Clark Tested-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: coresight@lists.linaro.org Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 25d56e0f1c07..12b55c2bc2ca 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -3086,7 +3086,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o if (aux_offset >= auxtrace_event->offset && aux_offset + aux_size <= auxtrace_event->offset + auxtrace_event->size) { - struct cs_etm_queue *etmq = etm->queues.queue_array[auxtrace_event->idx].priv; + struct cs_etm_queue *etmq = cs_etm__get_queue(etm, auxtrace_event->cpu); /* * If this AUX event was inside this buffer somewhere, create a new auxtrace event @@ -3095,6 +3095,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o auxtrace_fragment.auxtrace = *auxtrace_event; auxtrace_fragment.auxtrace.size = aux_size; auxtrace_fragment.auxtrace.offset = aux_offset; + auxtrace_fragment.auxtrace.idx = etmq->queue_nr; file_offset += aux_offset - auxtrace_event->offset + auxtrace_event->header.size; pr_debug3("CS ETM: Queue buffer size: %#"PRI_lx64" offset: %#"PRI_lx64 -- cgit v1.2.3 From 3d020f2e3baea49f68c71f73ebb947da8e6fedc5 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 19 Jan 2026 10:18:36 +0000 Subject: perf cs-etm: Test sparse CPU maps We only currently test with default (all CPUs) or --per-thread mode. Different permutations of the "-C" option can affect decoding so add tests for some of them. Signed-off-by: James Clark Tested-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_coresight.sh | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh index 1c750b67d141..bbf89e944e7b 100755 --- a/tools/perf/tests/shell/test_arm_coresight.sh +++ b/tools/perf/tests/shell/test_arm_coresight.sh @@ -198,6 +198,58 @@ arm_cs_etm_basic_test() { arm_cs_report "CoreSight basic testing with '$*'" $err } +arm_cs_etm_test_cpu_list() { + echo "Testing sparse CPU list: $1" + perf record -o ${perfdata} -e cs_etm//u -C $1 \ + -- taskset --cpu-list $1 true > /dev/null 2>&1 + perf_script_branch_samples true + err=$? + arm_cs_report "CoreSight sparse CPUs with '$*'" $err +} + +arm_cs_etm_sparse_cpus_test() { + # Iterate for every ETM device + cpus=() + for dev in /sys/bus/event_source/devices/cs_etm/cpu*; do + # Canonicalize the path + dev=`readlink -f $dev` + + # Find the ETM device belonging to which CPU + cpus+=("$(cat $dev/cpu)") + done + + mapfile -t cpus < <(printf '%s\n' "${cpus[@]}" | sort -n) + total=${#cpus[@]} + + # Need more than 1 to test + if [ $total -le 1 ]; then + return 0 + fi + + half=$((total / 2)) + + # First half + first_half=$(IFS=,; echo "${cpus[*]:0:$half}") + arm_cs_etm_test_cpu_list $first_half + + # Second half + second_half=$(IFS=,; echo "${cpus[*]:$half}") + arm_cs_etm_test_cpu_list $second_half + + # Odd list is the same as halves unless >= 4 CPUs + if [ $total -lt 4 ]; then + return 0 + fi + + # Odd indices + odd_cpus=() + for ((i=1; i Date: Mon, 19 Jan 2026 13:04:56 +0100 Subject: docs: kdoc: fix logic to handle unissued warnings Changeset 469c1c9eb6c9 ("kernel-doc: Issue warnings that were silently discarded") didn't properly addressed the missing messages behavior, as it was calling directly python logger low-level function, instead of using the expected method to emit warnings. Basically, there are two methods to log messages: - self.config.log.warning() - This is the raw level to emit a warning. It just writes the a message at stderr, via python logging, as it is initialized as: self.config.log = logging.getLogger("kernel-doc") - self.config.warning() - This is where we actually consider a message as a warning, properly incrementing error count. Due to that, several parsing error messages are internally considered as success, causing -Werror to not work on such messages. While here, ensure that the last ignored entry will also be handled by adding an extra check at the end of the parse handler. Fixes: 469c1c9eb6c9 ("kernel-doc: Issue warnings that were silently discarded") Closes: https://lore.kernel.org/linux-doc/20260112091053.00cee29a@foz.lan/ Cc: stable@vger.kernel.org Signed-off-by: Mauro Carvalho Chehab Acked-by: Andy Shevchenko Signed-off-by: Jonathan Corbet Message-ID: <95109a6585171da4d6900049deaa2634b41ee743.1768823489.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/kdoc_parser.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index a9a37519145d..c03505889dc2 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -295,7 +295,7 @@ class KernelEntry: # TODO: rename to emit_message after removal of kernel-doc.pl def emit_msg(self, ln, msg, *, warning=True): - """Emit a message""" + """Emit a message.""" log_msg = f"{self.fname}:{ln} {msg}" @@ -448,18 +448,37 @@ class KernelDoc: self.config.log.debug("Output: %s:%s = %s", dtype, name, pformat(args)) + def emit_unused_warnings(self): + """ + When the parser fails to produce a valid entry, it places some + warnings under `entry.warnings` that will be discarded when resetting + the state. + + Ensure that those warnings are not lost. + + .. note:: + + Because we are calling `config.warning()` here, those + warnings are not filtered by the `-W` parameters: they will all + be produced even when `-Wreturn`, `-Wshort-desc`, and/or + `-Wcontents-before-sections` are used. + + Allowing those warnings to be filtered is complex, because it + would require storing them in a buffer and then filtering them + during the output step of the code, depending on the + selected symbols. + """ + if self.entry and self.entry not in self.entries: + for log_msg in self.entry.warnings: + self.config.warning(log_msg) + def reset_state(self, ln): """ Ancillary routine to create a new entry. It initializes all variables used by the state machine. """ - # - # Flush the warnings out before we proceed further - # - if self.entry and self.entry not in self.entries: - for log_msg in self.entry.warnings: - self.config.log.warning(log_msg) + self.emit_unused_warnings() self.entry = KernelEntry(self.config, self.fname, ln) @@ -1741,6 +1760,8 @@ class KernelDoc: # Hand this line to the appropriate state handler self.state_actions[self.state](self, ln, line) + self.emit_unused_warnings() + except OSError: self.config.log.error(f"Error: Cannot open file {self.fname}") -- cgit v1.2.3 From eba6ffd126cd52358181ed5a179644a161f9c65f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 19 Jan 2026 13:05:01 +0100 Subject: docs: kdoc: move kernel-doc to tools/docs kernel-doc is the last documentation-related tool still living outside of the tools/docs directory; the time has come to move it over. [mchehab: fixed kdoc lib location] Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <311d17e403524349940a8b12de6b5e91e554b1f4.1768823489.git.mchehab+huawei@kernel.org> --- Documentation/conf.py | 2 +- Documentation/doc-guide/kernel-doc.rst | 8 +- Documentation/kbuild/kbuild.rst | 2 +- Documentation/process/coding-style.rst | 2 +- .../translations/it_IT/doc-guide/kernel-doc.rst | 8 +- .../translations/sp_SP/process/coding-style.rst | 2 +- .../translations/zh_CN/doc-guide/kernel-doc.rst | 10 +- Documentation/translations/zh_CN/kbuild/kbuild.rst | 2 +- .../translations/zh_CN/process/coding-style.rst | 2 +- .../translations/zh_TW/process/coding-style.rst | 2 +- MAINTAINERS | 2 - Makefile | 2 +- drivers/gpu/drm/i915/Makefile | 2 +- scripts/kernel-doc | 1 - scripts/kernel-doc.py | 360 --------------------- tools/docs/find-unused-docs.sh | 2 +- tools/docs/kernel-doc | 360 +++++++++++++++++++++ tools/docs/sphinx-build-wrapper | 2 +- 18 files changed, 384 insertions(+), 387 deletions(-) delete mode 120000 scripts/kernel-doc delete mode 100755 scripts/kernel-doc.py create mode 100755 tools/docs/kernel-doc (limited to 'tools') diff --git a/Documentation/conf.py b/Documentation/conf.py index 16d025af1f30..652be9221246 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -585,7 +585,7 @@ pdf_documents = [ # kernel-doc extension configuration for running Sphinx directly (e.g. by Read # the Docs). In a normal build, these are supplied from the Makefile via command # line arguments. -kerneldoc_bin = "../scripts/kernel-doc.py" +kerneldoc_bin = "../tools/docs/kernel-doc" # Not used now kerneldoc_srctree = ".." def setup(app): diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst index b56128d7f5c3..8d2c09fb36e4 100644 --- a/Documentation/doc-guide/kernel-doc.rst +++ b/Documentation/doc-guide/kernel-doc.rst @@ -54,7 +54,7 @@ Running the ``kernel-doc`` tool with increased verbosity and without actual output generation may be used to verify proper formatting of the documentation comments. For example:: - scripts/kernel-doc -v -none drivers/foo/bar.c + tools/docs/kernel-doc -v -none drivers/foo/bar.c The documentation format of ``.c`` files is also verified by the kernel build when it is requested to perform extra gcc checks:: @@ -365,7 +365,7 @@ differentiated by whether the macro name is immediately followed by a left parenthesis ('(') for function-like macros or not followed by one for object-like macros. -Function-like macros are handled like functions by ``scripts/kernel-doc``. +Function-like macros are handled like functions by ``tools/docs/kernel-doc``. They may have a parameter list. Object-like macros have do not have a parameter list. @@ -596,8 +596,8 @@ from the source file. The kernel-doc extension is included in the kernel source tree, at ``Documentation/sphinx/kerneldoc.py``. Internally, it uses the -``scripts/kernel-doc`` script to extract the documentation comments from the -source. +``tools/docs/kernel-doc`` script to extract the documentation comments from +the source. .. _kernel_doc: diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index 82826b0332df..5a9013bacfb7 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -180,7 +180,7 @@ architecture. KDOCFLAGS --------- Specify extra (warning/error) flags for kernel-doc checks during the build, -see scripts/kernel-doc for which flags are supported. Note that this doesn't +see tools/docs/kernel-doc for which flags are supported. Note that this doesn't (currently) apply to documentation builds. ARCH diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 258158637f65..35b381230f6e 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -614,7 +614,7 @@ it. When commenting the kernel API functions, please use the kernel-doc format. See the files at :ref:`Documentation/doc-guide/ ` and -``scripts/kernel-doc`` for details. Note that the danger of over-commenting +``tools/docs/kernel-doc`` for details. Note that the danger of over-commenting applies to kernel-doc comments all the same. Do not add boilerplate kernel-doc which simply reiterates what's obvious from the signature of the function. diff --git a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst index aa0e31d353d6..bac959b8b7b9 100644 --- a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst +++ b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst @@ -80,7 +80,7 @@ Al fine di verificare che i commenti siano formattati correttamente, potete eseguire il programma ``kernel-doc`` con un livello di verbosità alto e senza che questo produca alcuna documentazione. Per esempio:: - scripts/kernel-doc -v -none drivers/foo/bar.c + tools/docs/kernel-doc -v -none drivers/foo/bar.c Il formato della documentazione è verificato della procedura di generazione del kernel quando viene richiesto di effettuare dei controlli extra con GCC:: @@ -378,7 +378,7 @@ distinguono in base al fatto che il nome della macro simile a funzione sia immediatamente seguito da una parentesi sinistra ('(') mentre in quelle simili a oggetti no. -Le macro simili a funzioni sono gestite come funzioni da ``scripts/kernel-doc``. +Le macro simili a funzioni sono gestite come funzioni da ``tools/docs/kernel-doc``. Possono avere un elenco di parametri. Le macro simili a oggetti non hanno un elenco di parametri. @@ -595,7 +595,7 @@ documentazione presenti nel file sorgente (*source*). L'estensione kernel-doc fa parte dei sorgenti del kernel, la si può trovare in ``Documentation/sphinx/kerneldoc.py``. Internamente, viene utilizzato -lo script ``scripts/kernel-doc`` per estrarre i commenti di documentazione +lo script ``tools/docs/kernel-doc`` per estrarre i commenti di documentazione dai file sorgenti. Come utilizzare kernel-doc per generare pagine man @@ -604,4 +604,4 @@ Come utilizzare kernel-doc per generare pagine man Se volete utilizzare kernel-doc solo per generare delle pagine man, potete farlo direttamente dai sorgenti del kernel:: - $ scripts/kernel-doc -man $(git grep -l '/\*\*' -- :^Documentation :^tools) | scripts/split-man.pl /tmp/man + $ tools/docs/kernel-doc -man $(git grep -l '/\*\*' -- :^Documentation :^tools) | scripts/split-man.pl /tmp/man diff --git a/Documentation/translations/sp_SP/process/coding-style.rst b/Documentation/translations/sp_SP/process/coding-style.rst index 025223be9706..7d63aa8426e6 100644 --- a/Documentation/translations/sp_SP/process/coding-style.rst +++ b/Documentation/translations/sp_SP/process/coding-style.rst @@ -633,7 +633,7 @@ posiblemente POR QUÉ hace esto. Al comentar las funciones de la API del kernel, utilice el formato kernel-doc. Consulte los archivos en :ref:`Documentation/doc-guide/ ` -y ``scripts/kernel-doc`` para más detalles. +y ``tools/docs/kernel-doc`` para más detalles. El estilo preferido para comentarios largos (de varias líneas) es: diff --git a/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst b/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst index ccfb9b8329c2..fb2bbaaa85c1 100644 --- a/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst +++ b/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst @@ -43,7 +43,7 @@ kernel-doc注释用 ``/**`` 作为开始标记。 ``kernel-doc`` 工具将提取 用详细模式和不生成实际输出来运行 ``kernel-doc`` 工具,可以验证文档注释的格式 是否正确。例如:: - scripts/kernel-doc -v -none drivers/foo/bar.c + tools/docs/kernel-doc -v -none drivers/foo/bar.c 当请求执行额外的gcc检查时,内核构建将验证文档格式:: @@ -473,7 +473,7 @@ doc: *title* 如果没有选项,kernel-doc指令将包含源文件中的所有文档注释。 kernel-doc扩展包含在内核源代码树中,位于 ``Documentation/sphinx/kerneldoc.py`` 。 -在内部,它使用 ``scripts/kernel-doc`` 脚本从源代码中提取文档注释。 +在内部,它使用 ``tools/docs/kernel-doc`` 脚本从源代码中提取文档注释。 .. _kernel_doc_zh: @@ -482,18 +482,18 @@ kernel-doc扩展包含在内核源代码树中,位于 ``Documentation/sphinx/k 如果您只想使用kernel-doc生成手册页,可以从内核git树这样做:: - $ scripts/kernel-doc -man \ + $ tools/docs/kernel-doc -man \ $(git grep -l '/\*\*' -- :^Documentation :^tools) \ | scripts/split-man.pl /tmp/man 一些旧版本的git不支持路径排除语法的某些变体。 以下命令之一可能适用于这些版本:: - $ scripts/kernel-doc -man \ + $ tools/docs/kernel-doc -man \ $(git grep -l '/\*\*' -- . ':!Documentation' ':!tools') \ | scripts/split-man.pl /tmp/man - $ scripts/kernel-doc -man \ + $ tools/docs/kernel-doc -man \ $(git grep -l '/\*\*' -- . ":(exclude)Documentation" ":(exclude)tools") \ | scripts/split-man.pl /tmp/man diff --git a/Documentation/translations/zh_CN/kbuild/kbuild.rst b/Documentation/translations/zh_CN/kbuild/kbuild.rst index 57f5cf5b2cdd..a477b4b08958 100644 --- a/Documentation/translations/zh_CN/kbuild/kbuild.rst +++ b/Documentation/translations/zh_CN/kbuild/kbuild.rst @@ -174,7 +174,7 @@ UTS_MACHINE 变量(在某些架构中还包括内核配置)来猜测正确 KDOCFLAGS --------- 指定在构建过程中用于 kernel-doc 检查的额外(警告/错误)标志,查看 -scripts/kernel-doc 了解支持的标志。请注意,这目前不适用于文档构建。 +tools/docs/kernel-doc 了解支持的标志。请注意,这目前不适用于文档构建。 ARCH ---- diff --git a/Documentation/translations/zh_CN/process/coding-style.rst b/Documentation/translations/zh_CN/process/coding-style.rst index 0484d0c65c25..5a342a024c01 100644 --- a/Documentation/translations/zh_CN/process/coding-style.rst +++ b/Documentation/translations/zh_CN/process/coding-style.rst @@ -545,7 +545,7 @@ Linux 里这是提倡的做法,因为这样可以很简单的给读者提供 也可以加上它做这些事情的原因。 当注释内核 API 函数时,请使用 kernel-doc 格式。详见 -Documentation/translations/zh_CN/doc-guide/index.rst 和 scripts/kernel-doc 。 +Documentation/translations/zh_CN/doc-guide/index.rst 和 tools/docs/kernel-doc 。 长 (多行) 注释的首选风格是: diff --git a/Documentation/translations/zh_TW/process/coding-style.rst b/Documentation/translations/zh_TW/process/coding-style.rst index 311c6f6bad0b..e2ba97b3d8bb 100644 --- a/Documentation/translations/zh_TW/process/coding-style.rst +++ b/Documentation/translations/zh_TW/process/coding-style.rst @@ -548,7 +548,7 @@ Linux 裏這是提倡的做法,因爲這樣可以很簡單的給讀者提供 也可以加上它做這些事情的原因。 當註釋內核 API 函數時,請使用 kernel-doc 格式。詳見 -Documentation/translations/zh_CN/doc-guide/index.rst 和 scripts/kernel-doc 。 +Documentation/translations/zh_CN/doc-guide/index.rst 和 tools/docs/kernel-doc 。 長 (多行) 註釋的首選風格是: diff --git a/MAINTAINERS b/MAINTAINERS index 02ec226dd571..d009e2da2215 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7523,7 +7523,6 @@ S: Maintained P: Documentation/doc-guide/maintainer-profile.rst T: git git://git.lwn.net/linux.git docs-next F: Documentation/ -F: scripts/kernel-doc* F: tools/lib/python/* F: tools/docs/ F: tools/net/ynl/pyynl/lib/doc_generator.py @@ -7561,7 +7560,6 @@ M: Mauro Carvalho Chehab L: linux-doc@vger.kernel.org S: Maintained F: Documentation/sphinx/ -F: scripts/kernel-doc* F: tools/lib/python/* F: tools/docs/ diff --git a/Makefile b/Makefile index 3cd00b62cde9..81a4ab11256c 100644 --- a/Makefile +++ b/Makefile @@ -460,7 +460,7 @@ HOSTPKG_CONFIG = pkg-config # the KERNELDOC macro needs to be exported, as scripts/Makefile.build # has a logic to call it -KERNELDOC = $(srctree)/scripts/kernel-doc.py +KERNELDOC = $(srctree)/tools/docs/kernel-doc export KERNELDOC KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \ diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 4db24050edb0..c979c579de66 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -443,7 +443,7 @@ always-$(CONFIG_DRM_I915_WERROR) += \ quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@) cmd_hdrtest = $(CC) $(filter-out $(CFLAGS_GCOV), $(c_flags)) -S -o /dev/null -x c /dev/null -include $<; \ - $(srctree)/scripts/kernel-doc -none -Werror $<; touch $@ + $(KERNELDOC) -none -Werror $<; touch $@ $(obj)/%.hdrtest: $(src)/%.h FORCE $(call if_changed_dep,hdrtest) diff --git a/scripts/kernel-doc b/scripts/kernel-doc deleted file mode 120000 index 3b6ef807791a..000000000000 --- a/scripts/kernel-doc +++ /dev/null @@ -1 +0,0 @@ -kernel-doc.py \ No newline at end of file diff --git a/scripts/kernel-doc.py b/scripts/kernel-doc.py deleted file mode 100755 index 4e3b9cfe3fd7..000000000000 --- a/scripts/kernel-doc.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 -# Copyright(c) 2025: Mauro Carvalho Chehab . -# -# pylint: disable=C0103,R0912,R0914,R0915 -# -# NOTE: While kernel-doc requires at least version 3.6 to run, the -# command line should work with Python 3.2+ (tested with 3.4). -# The rationale is that it shall fail gracefully during Kernel -# compilation with older Kernel versions. Due to that: -# - encoding line is needed here; -# - f-strings cannot be used in this file. -# - libraries that require newer versions can only be included -# after the Python version has been checked. -# -# Converted from the kernel-doc script originally written in Perl -# under GPLv2, copyrighted since 1998 by the following authors: -# -# Aditya Srivastava -# Akira Yokosawa -# Alexander A. Klimov -# Alexander Lobakin -# André Almeida -# Andy Shevchenko -# Anna-Maria Behnsen -# Armin Kuster -# Bart Van Assche -# Ben Hutchings -# Borislav Petkov -# Chen-Yu Tsai -# Coco Li -# Conchúr Navid -# Daniel Santos -# Danilo Cesar Lemes de Paula -# Dan Luedtke -# Donald Hunter -# Gabriel Krisman Bertazi -# Greg Kroah-Hartman -# Harvey Harrison -# Horia Geanta -# Ilya Dryomov -# Jakub Kicinski -# Jani Nikula -# Jason Baron -# Jason Gunthorpe -# Jérémy Bobbio -# Johannes Berg -# Johannes Weiner -# Jonathan Cameron -# Jonathan Corbet -# Jonathan Neuschäfer -# Kamil Rytarowski -# Kees Cook -# Laurent Pinchart -# Levin, Alexander (Sasha Levin) -# Linus Torvalds -# Lucas De Marchi -# Mark Rutland -# Markus Heiser -# Martin Waitz -# Masahiro Yamada -# Matthew Wilcox -# Mauro Carvalho Chehab -# Michal Wajdeczko -# Michael Zucchi -# Mike Rapoport -# Niklas Söderlund -# Nishanth Menon -# Paolo Bonzini -# Pavan Kumar Linga -# Pavel Pisa -# Peter Maydell -# Pierre-Louis Bossart -# Randy Dunlap -# Richard Kennedy -# Rich Walker -# Rolf Eike Beer -# Sakari Ailus -# Silvio Fricke -# Simon Huggins -# Tim Waugh -# Tomasz Warniełło -# Utkarsh Tripathi -# valdis.kletnieks@vt.edu -# Vegard Nossum -# Will Deacon -# Yacine Belkadi -# Yujie Liu - -""" -Print formatted kernel documentation to stdout. - -Read C language source or header FILEs, extract embedded -documentation comments, and print formatted documentation -to standard output. - -The documentation comments are identified by the ``/**`` -opening comment mark. - -See Documentation/doc-guide/kernel-doc.rst for the -documentation comment syntax. -""" - -import argparse -import logging -import os -import sys - -# Import Python modules - -LIB_DIR = "../tools/lib/python" -SRC_DIR = os.path.dirname(os.path.realpath(__file__)) - -sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) - -WERROR_RETURN_CODE = 3 - -DESC = """ -Read C language source or header FILEs, extract embedded documentation comments, -and print formatted documentation to standard output. - -The documentation comments are identified by the "/**" opening comment mark. - -See Documentation/doc-guide/kernel-doc.rst for the documentation comment syntax. -""" - -EXPORT_FILE_DESC = """ -Specify an additional FILE in which to look for EXPORT_SYMBOL information. - -May be used multiple times. -""" - -EXPORT_DESC = """ -Only output documentation for symbols that have been -exported using EXPORT_SYMBOL() and related macros in any input -FILE or -export-file FILE. -""" - -INTERNAL_DESC = """ -Only output documentation for symbols that have NOT been -exported using EXPORT_SYMBOL() and related macros in any input -FILE or -export-file FILE. -""" - -FUNCTION_DESC = """ -Only output documentation for the given function or DOC: section -title. All other functions and DOC: sections are ignored. - -May be used multiple times. -""" - -NOSYMBOL_DESC = """ -Exclude the specified symbol from the output documentation. - -May be used multiple times. -""" - -FILES_DESC = """ -Header and C source files to be parsed. -""" - -WARN_CONTENTS_BEFORE_SECTIONS_DESC = """ -Warn if there are contents before sections (deprecated). - -This option is kept just for backward-compatibility, but it does nothing, -neither here nor at the original Perl script. -""" - - -class MsgFormatter(logging.Formatter): - """Helper class to format warnings in a similar way to kernel-doc.pl.""" - - def format(self, record): - record.levelname = record.levelname.capitalize() - return logging.Formatter.format(self, record) - -def main(): - """ - Main program. - - By default, the return value is: - - - 0: success or Python version is not compatible with - kernel-doc. If -Werror is not used, it will also - return 0 if there are issues at kernel-doc markups; - - - 1: an abnormal condition happened; - - - 2: argparse issued an error; - - - 3: -Werror is used, and one or more unfiltered parse warnings happened. - """ - - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, - description=DESC) - - # - # Normal arguments - # - parser.add_argument("-v", "-verbose", "--verbose", action="store_true", - help="Verbose output, more warnings and other information.") - - parser.add_argument("-d", "-debug", "--debug", action="store_true", - help="Enable debug messages") - - parser.add_argument("-M", "-modulename", "--modulename", - default="Kernel API", - help="Allow setting a module name at the output.") - - parser.add_argument("-l", "-enable-lineno", "--enable_lineno", - action="store_true", - help="Enable line number output (only in ReST mode)") - - # - # Arguments to control the warning behavior - # - parser.add_argument("-Wreturn", "--wreturn", action="store_true", - help="Warns about the lack of a return markup on functions.") - - parser.add_argument("-Wshort-desc", "-Wshort-description", "--wshort-desc", - action="store_true", - help="Warns if initial short description is missing") - - parser.add_argument("-Wcontents-before-sections", - "--wcontents-before-sections", action="store_true", - help=WARN_CONTENTS_BEFORE_SECTIONS_DESC) - - parser.add_argument("-Wall", "--wall", action="store_true", - help="Enable all types of warnings") - - parser.add_argument("-Werror", "--werror", action="store_true", - help="Treat warnings as errors.") - - parser.add_argument("-export-file", "--export-file", action='append', - help=EXPORT_FILE_DESC) - - # - # Output format mutually-exclusive group - # - out_group = parser.add_argument_group("Output format selection (mutually exclusive)") - - out_fmt = out_group.add_mutually_exclusive_group() - - out_fmt.add_argument("-m", "-man", "--man", action="store_true", - help="Output troff manual page format.") - out_fmt.add_argument("-r", "-rst", "--rst", action="store_true", - help="Output reStructuredText format (default).") - out_fmt.add_argument("-N", "-none", "--none", action="store_true", - help="Do not output documentation, only warnings.") - - # - # Output selection mutually-exclusive group - # - sel_group = parser.add_argument_group("Output selection (mutually exclusive)") - sel_mut = sel_group.add_mutually_exclusive_group() - - sel_mut.add_argument("-e", "-export", "--export", action='store_true', - help=EXPORT_DESC) - - sel_mut.add_argument("-i", "-internal", "--internal", action='store_true', - help=INTERNAL_DESC) - - sel_mut.add_argument("-s", "-function", "--symbol", action='append', - help=FUNCTION_DESC) - - # - # Those are valid for all 3 types of filter - # - parser.add_argument("-n", "-nosymbol", "--nosymbol", action='append', - help=NOSYMBOL_DESC) - - parser.add_argument("-D", "-no-doc-sections", "--no-doc-sections", - action='store_true', help="Don't output DOC sections") - - parser.add_argument("files", metavar="FILE", - nargs="+", help=FILES_DESC) - - args = parser.parse_args() - - if args.wall: - args.wreturn = True - args.wshort_desc = True - args.wcontents_before_sections = True - - logger = logging.getLogger() - - if not args.debug: - logger.setLevel(logging.INFO) - else: - logger.setLevel(logging.DEBUG) - - formatter = MsgFormatter('%(levelname)s: %(message)s') - - handler = logging.StreamHandler() - handler.setFormatter(formatter) - - logger.addHandler(handler) - - python_ver = sys.version_info[:2] - if python_ver < (3,6): - # - # Depending on the Kernel configuration, kernel-doc --none is called at - # build time. As we don't want to break compilation due to the - # usage of an old Python version, return 0 here. - # - if args.none: - logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks") - sys.exit(0) - - sys.exit("Python 3.6 or later is required by kernel-doc. Aborting.") - - if python_ver < (3,7): - logger.warning("Python 3.7 or later is required for correct results") - - # - # Import kernel-doc libraries only after checking the Python version - # - from kdoc.kdoc_files import KernelFiles # pylint: disable=C0415 - from kdoc.kdoc_output import RestFormat, ManFormat # pylint: disable=C0415 - - if args.man: - out_style = ManFormat(modulename=args.modulename) - elif args.none: - out_style = None - else: - out_style = RestFormat() - - kfiles = KernelFiles(verbose=args.verbose, - out_style=out_style, werror=args.werror, - wreturn=args.wreturn, wshort_desc=args.wshort_desc, - wcontents_before_sections=args.wcontents_before_sections) - - kfiles.parse(args.files, export_file=args.export_file) - - for t in kfiles.msg(enable_lineno=args.enable_lineno, export=args.export, - internal=args.internal, symbol=args.symbol, - nosymbol=args.nosymbol, export_file=args.export_file, - no_doc_sections=args.no_doc_sections): - msg = t[1] - if msg: - print(msg) - - error_count = kfiles.errors - if not error_count: - sys.exit(0) - - if args.werror: - print("%s warnings as errors" % error_count) # pylint: disable=C0209 - sys.exit(WERROR_RETURN_CODE) - - if args.verbose: - print("%s errors" % error_count) # pylint: disable=C0209 - - sys.exit(0) - -# -# Call main method -# -if __name__ == "__main__": - main() diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh index ca4e607ec3f7..53514c759dc1 100755 --- a/tools/docs/find-unused-docs.sh +++ b/tools/docs/find-unused-docs.sh @@ -54,7 +54,7 @@ for file in `find $1 -name '*.c'`; do if [[ ${FILES_INCLUDED[$file]+_} ]]; then continue; fi - str=$(PYTHONDONTWRITEBYTECODE=1 scripts/kernel-doc -export "$file" 2>/dev/null) + str=$(PYTHONDONTWRITEBYTECODE=1 tools/docs/kernel-doc -export "$file" 2>/dev/null) if [[ -n "$str" ]]; then echo "$file" fi diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc new file mode 100755 index 000000000000..a19a92566780 --- /dev/null +++ b/tools/docs/kernel-doc @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# +# pylint: disable=C0103,R0912,R0914,R0915 +# +# NOTE: While kernel-doc requires at least version 3.6 to run, the +# command line should work with Python 3.2+ (tested with 3.4). +# The rationale is that it shall fail gracefully during Kernel +# compilation with older Kernel versions. Due to that: +# - encoding line is needed here; +# - f-strings cannot be used in this file. +# - libraries that require newer versions can only be included +# after the Python version has been checked. +# +# Converted from the kernel-doc script originally written in Perl +# under GPLv2, copyrighted since 1998 by the following authors: +# +# Aditya Srivastava +# Akira Yokosawa +# Alexander A. Klimov +# Alexander Lobakin +# André Almeida +# Andy Shevchenko +# Anna-Maria Behnsen +# Armin Kuster +# Bart Van Assche +# Ben Hutchings +# Borislav Petkov +# Chen-Yu Tsai +# Coco Li +# Conchúr Navid +# Daniel Santos +# Danilo Cesar Lemes de Paula +# Dan Luedtke +# Donald Hunter +# Gabriel Krisman Bertazi +# Greg Kroah-Hartman +# Harvey Harrison +# Horia Geanta +# Ilya Dryomov +# Jakub Kicinski +# Jani Nikula +# Jason Baron +# Jason Gunthorpe +# Jérémy Bobbio +# Johannes Berg +# Johannes Weiner +# Jonathan Cameron +# Jonathan Corbet +# Jonathan Neuschäfer +# Kamil Rytarowski +# Kees Cook +# Laurent Pinchart +# Levin, Alexander (Sasha Levin) +# Linus Torvalds +# Lucas De Marchi +# Mark Rutland +# Markus Heiser +# Martin Waitz +# Masahiro Yamada +# Matthew Wilcox +# Mauro Carvalho Chehab +# Michal Wajdeczko +# Michael Zucchi +# Mike Rapoport +# Niklas Söderlund +# Nishanth Menon +# Paolo Bonzini +# Pavan Kumar Linga +# Pavel Pisa +# Peter Maydell +# Pierre-Louis Bossart +# Randy Dunlap +# Richard Kennedy +# Rich Walker +# Rolf Eike Beer +# Sakari Ailus +# Silvio Fricke +# Simon Huggins +# Tim Waugh +# Tomasz Warniełło +# Utkarsh Tripathi +# valdis.kletnieks@vt.edu +# Vegard Nossum +# Will Deacon +# Yacine Belkadi +# Yujie Liu + +""" +Print formatted kernel documentation to stdout. + +Read C language source or header FILEs, extract embedded +documentation comments, and print formatted documentation +to standard output. + +The documentation comments are identified by the ``/**`` +opening comment mark. + +See Documentation/doc-guide/kernel-doc.rst for the +documentation comment syntax. +""" + +import argparse +import logging +import os +import sys + +# Import Python modules + +LIB_DIR = "../lib/python" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +WERROR_RETURN_CODE = 3 + +DESC = """ +Read C language source or header FILEs, extract embedded documentation comments, +and print formatted documentation to standard output. + +The documentation comments are identified by the "/**" opening comment mark. + +See Documentation/doc-guide/kernel-doc.rst for the documentation comment syntax. +""" + +EXPORT_FILE_DESC = """ +Specify an additional FILE in which to look for EXPORT_SYMBOL information. + +May be used multiple times. +""" + +EXPORT_DESC = """ +Only output documentation for symbols that have been +exported using EXPORT_SYMBOL() and related macros in any input +FILE or -export-file FILE. +""" + +INTERNAL_DESC = """ +Only output documentation for symbols that have NOT been +exported using EXPORT_SYMBOL() and related macros in any input +FILE or -export-file FILE. +""" + +FUNCTION_DESC = """ +Only output documentation for the given function or DOC: section +title. All other functions and DOC: sections are ignored. + +May be used multiple times. +""" + +NOSYMBOL_DESC = """ +Exclude the specified symbol from the output documentation. + +May be used multiple times. +""" + +FILES_DESC = """ +Header and C source files to be parsed. +""" + +WARN_CONTENTS_BEFORE_SECTIONS_DESC = """ +Warn if there are contents before sections (deprecated). + +This option is kept just for backward-compatibility, but it does nothing, +neither here nor at the original Perl script. +""" + + +class MsgFormatter(logging.Formatter): + """Helper class to format warnings in a similar way to kernel-doc.pl.""" + + def format(self, record): + record.levelname = record.levelname.capitalize() + return logging.Formatter.format(self, record) + +def main(): + """ + Main program. + + By default, the return value is: + + - 0: success or Python version is not compatible with + kernel-doc. If -Werror is not used, it will also + return 0 if there are issues at kernel-doc markups; + + - 1: an abnormal condition happened; + + - 2: argparse issued an error; + + - 3: -Werror is used, and one or more unfiltered parse warnings happened. + """ + + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, + description=DESC) + + # + # Normal arguments + # + parser.add_argument("-v", "-verbose", "--verbose", action="store_true", + help="Verbose output, more warnings and other information.") + + parser.add_argument("-d", "-debug", "--debug", action="store_true", + help="Enable debug messages") + + parser.add_argument("-M", "-modulename", "--modulename", + default="Kernel API", + help="Allow setting a module name at the output.") + + parser.add_argument("-l", "-enable-lineno", "--enable_lineno", + action="store_true", + help="Enable line number output (only in ReST mode)") + + # + # Arguments to control the warning behavior + # + parser.add_argument("-Wreturn", "--wreturn", action="store_true", + help="Warns about the lack of a return markup on functions.") + + parser.add_argument("-Wshort-desc", "-Wshort-description", "--wshort-desc", + action="store_true", + help="Warns if initial short description is missing") + + parser.add_argument("-Wcontents-before-sections", + "--wcontents-before-sections", action="store_true", + help=WARN_CONTENTS_BEFORE_SECTIONS_DESC) + + parser.add_argument("-Wall", "--wall", action="store_true", + help="Enable all types of warnings") + + parser.add_argument("-Werror", "--werror", action="store_true", + help="Treat warnings as errors.") + + parser.add_argument("-export-file", "--export-file", action='append', + help=EXPORT_FILE_DESC) + + # + # Output format mutually-exclusive group + # + out_group = parser.add_argument_group("Output format selection (mutually exclusive)") + + out_fmt = out_group.add_mutually_exclusive_group() + + out_fmt.add_argument("-m", "-man", "--man", action="store_true", + help="Output troff manual page format.") + out_fmt.add_argument("-r", "-rst", "--rst", action="store_true", + help="Output reStructuredText format (default).") + out_fmt.add_argument("-N", "-none", "--none", action="store_true", + help="Do not output documentation, only warnings.") + + # + # Output selection mutually-exclusive group + # + sel_group = parser.add_argument_group("Output selection (mutually exclusive)") + sel_mut = sel_group.add_mutually_exclusive_group() + + sel_mut.add_argument("-e", "-export", "--export", action='store_true', + help=EXPORT_DESC) + + sel_mut.add_argument("-i", "-internal", "--internal", action='store_true', + help=INTERNAL_DESC) + + sel_mut.add_argument("-s", "-function", "--symbol", action='append', + help=FUNCTION_DESC) + + # + # Those are valid for all 3 types of filter + # + parser.add_argument("-n", "-nosymbol", "--nosymbol", action='append', + help=NOSYMBOL_DESC) + + parser.add_argument("-D", "-no-doc-sections", "--no-doc-sections", + action='store_true', help="Don't output DOC sections") + + parser.add_argument("files", metavar="FILE", + nargs="+", help=FILES_DESC) + + args = parser.parse_args() + + if args.wall: + args.wreturn = True + args.wshort_desc = True + args.wcontents_before_sections = True + + logger = logging.getLogger() + + if not args.debug: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.DEBUG) + + formatter = MsgFormatter('%(levelname)s: %(message)s') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger.addHandler(handler) + + python_ver = sys.version_info[:2] + if python_ver < (3,6): + # + # Depending on the Kernel configuration, kernel-doc --none is called at + # build time. As we don't want to break compilation due to the + # usage of an old Python version, return 0 here. + # + if args.none: + logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks") + sys.exit(0) + + sys.exit("Python 3.6 or later is required by kernel-doc. Aborting.") + + if python_ver < (3,7): + logger.warning("Python 3.7 or later is required for correct results") + + # + # Import kernel-doc libraries only after checking the Python version + # + from kdoc.kdoc_files import KernelFiles # pylint: disable=C0415 + from kdoc.kdoc_output import RestFormat, ManFormat # pylint: disable=C0415 + + if args.man: + out_style = ManFormat(modulename=args.modulename) + elif args.none: + out_style = None + else: + out_style = RestFormat() + + kfiles = KernelFiles(verbose=args.verbose, + out_style=out_style, werror=args.werror, + wreturn=args.wreturn, wshort_desc=args.wshort_desc, + wcontents_before_sections=args.wcontents_before_sections) + + kfiles.parse(args.files, export_file=args.export_file) + + for t in kfiles.msg(enable_lineno=args.enable_lineno, export=args.export, + internal=args.internal, symbol=args.symbol, + nosymbol=args.nosymbol, export_file=args.export_file, + no_doc_sections=args.no_doc_sections): + msg = t[1] + if msg: + print(msg) + + error_count = kfiles.errors + if not error_count: + sys.exit(0) + + if args.werror: + print("%s warnings as errors" % error_count) # pylint: disable=C0209 + sys.exit(WERROR_RETURN_CODE) + + if args.verbose: + print("%s errors" % error_count) # pylint: disable=C0209 + + sys.exit(0) + +# +# Call main method +# +if __name__ == "__main__": + main() diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 7a5fcef25429..cb2a5005e633 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -246,7 +246,7 @@ class SphinxBuilder: # self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build") self.kerneldoc = self.get_path(os.environ.get("KERNELDOC", - "scripts/kernel-doc.py")) + "tools/docs/kernel-doc")) self.builddir = self.get_path(builddir, use_cwd=True, abs_path=True) # -- cgit v1.2.3 From 32e9a42440a230b14c438099bc5fccb5012a638a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 13:05:02 +0100 Subject: docs: kdoc: move the return values to the helper message It makes sense to describe what kernel-doc is expected to return on its help message. Move such messages to argparse epilog. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <3bcfa48016770929fcd073376515e3ff0b777ea8.1768823489.git.mchehab+huawei@kernel.org> --- tools/docs/kernel-doc | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc index a19a92566780..902397804e80 100755 --- a/tools/docs/kernel-doc +++ b/tools/docs/kernel-doc @@ -166,6 +166,20 @@ This option is kept just for backward-compatibility, but it does nothing, neither here nor at the original Perl script. """ +EPILOG = """ +The return value is: + +- 0: success or Python version is not compatible with +kernel-doc. If -Werror is not used, it will also +return 0 if there are issues at kernel-doc markups; + +- 1: an abnormal condition happened; + +- 2: argparse issued an error; + +- 3: When -Werror is used, it means that one or more unfiltered parse + warnings happened. +""" class MsgFormatter(logging.Formatter): """Helper class to format warnings in a similar way to kernel-doc.pl.""" @@ -178,21 +192,10 @@ def main(): """ Main program. - By default, the return value is: - - - 0: success or Python version is not compatible with - kernel-doc. If -Werror is not used, it will also - return 0 if there are issues at kernel-doc markups; - - - 1: an abnormal condition happened; - - - 2: argparse issued an error; - - - 3: -Werror is used, and one or more unfiltered parse warnings happened. """ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, - description=DESC) + description=DESC, epilog=EPILOG) # # Normal arguments -- cgit v1.2.3 From 35c0f975ef4a96cb488bcb5fca6e852fc347bc49 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 13:05:03 +0100 Subject: docs: kdoc: improve description of MsgFormatter The description there is quite vague. Make it clearer. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <94269990e2d665bec08a1b6f4d28d84939cb9d83.1768823489.git.mchehab+huawei@kernel.org> --- tools/docs/kernel-doc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc index 902397804e80..aed09f9a54dd 100755 --- a/tools/docs/kernel-doc +++ b/tools/docs/kernel-doc @@ -182,7 +182,10 @@ return 0 if there are issues at kernel-doc markups; """ class MsgFormatter(logging.Formatter): - """Helper class to format warnings in a similar way to kernel-doc.pl.""" + """ + Helper class to capitalize errors and warnings, the same way + the venerable (now retired) kernel-doc.pl used to do. + """ def format(self, record): record.levelname = record.levelname.capitalize() -- cgit v1.2.3 From 4a3efd128f7da996b677151790d043ec44a00561 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 20 Jan 2026 15:50:38 -0700 Subject: docs: sphinx-build-wrapper: stop setting kerneldoc_bin for Sphinx Now that the Sphinx build does not use the kerneldoc_bin configuration variable, we shouldn't try to set it in the build wrapper or we get a nifty warning: WARNING: unknown config value 'kerneldoc_bin' in override, ignoring Signed-off-by: Jonathan Corbet --- tools/docs/sphinx-build-wrapper | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index cb2a5005e633..9f1ae1485f84 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -750,7 +750,6 @@ class SphinxBuilder: build_args = args + [ "-d", doctree_dir, - "-D", f"kerneldoc_bin={kerneldoc}", "-D", f"version={self.kernelversion}", "-D", f"release={self.kernelrelease}", "-D", f"kerneldoc_srctree={self.srctree}", -- cgit v1.2.3 From b2664a90c171846fcd572d93f6f21459721a1d2e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 12 Jan 2026 09:19:49 -0700 Subject: jobserver: Split up the big try: block The parsing of jobserver options is done in a massive try: block that hides problems and (perhaps) bugs. Split up that block and make the logic explicit by moving the initial parsing of MAKEFLAGS out of that block. Add warnings in the places things can go wrong. Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- tools/lib/python/jobserver.py | 143 ++++++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 53 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index 616411087725..a7c70ff4c375 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -35,6 +35,9 @@ import os import subprocess import sys +def warn(text, *args): + print(f'WARNING: {text}', *args, file = sys.stderr) + class JobserverExec: """ Claim all slots from make using POSIX Jobserver. @@ -58,64 +61,98 @@ class JobserverExec: if self.is_open: return - - try: - # Fetch the make environment options. - flags = os.environ["MAKEFLAGS"] - # Look for "--jobserver=R,W" - # Note that GNU Make has used --jobserver-fds and --jobserver-auth - # so this handles all of them. - opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] - - # Parse out R,W file descriptor numbers and set them nonblocking. - # If the MAKEFLAGS variable contains multiple instances of the - # --jobserver-auth= option, the last one is relevant. - fds = opts[-1].split("=", 1)[1] - - # Starting with GNU Make 4.4, named pipes are used for reader - # and writer. - # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134 - _, _, path = fds.partition("fifo:") - - if path: + self.is_open = True # We only try once + self.claim = None + # + # Check the make flags for "--jobserver=R,W" + # Note that GNU Make has used --jobserver-fds and --jobserver-auth + # so this handles all of them. + # + flags = os.environ.get('MAKEFLAGS', '') + opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] + if not opts: + return + # + # Separate out the provided file descriptors + # + split_opt = opts[-1].split('=', 1) + if len(split_opt) != 2: + warn('unparseable option:', opts[-1]) + return + fds = split_opt[1] + # + # As of GNU Make 4.4, we'll be looking for a named pipe + # identified as fifo:path + # + if fds.startswith('fifo:'): + path = fds[len('fifo:'):] + try: self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK) self.writer = os.open(path, os.O_WRONLY) - else: - self.reader, self.writer = [int(x) for x in fds.split(",", 1)] + except (OSError, IOError): + warn('unable to open jobserver pipe', path) + return + # + # Otherwise look for integer file-descriptor numbers. + # + else: + split_fds = fds.split(',') + if len(split_fds) != 2: + warn('malformed jobserver file descriptors:', fds) + return + try: + self.reader = int(split_fds[0]) + self.writer = int(split_fds[1]) + except ValueError: + warn('non-integer jobserver file-descriptors:', fds) + return + try: + # # Open a private copy of reader to avoid setting nonblocking # on an unexpecting process with the same reader fd. - self.reader = os.open("/proc/self/fd/%d" % (self.reader), + # + self.reader = os.open(f"/proc/self/fd/{self.reader}", os.O_RDONLY | os.O_NONBLOCK) - - # Read out as many jobserver slots as possible - while True: - try: - slot = os.read(self.reader, 8) - if not slot: - # Clear self.jobs to prevent us from probably writing incorrect file. - self.jobs = b"" - raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?") - self.jobs += slot - except (OSError, IOError) as e: - if e.errno == errno.EWOULDBLOCK: - # Stop at the end of the jobserver queue. - break - # If something went wrong, give back the jobs. - if self.jobs: - os.write(self.writer, self.jobs) - raise e - - # Add a bump for our caller's reserveration, since we're just going - # to sit here blocked on our child. - self.claim = len(self.jobs) + 1 - - except (KeyError, IndexError, ValueError, OSError, IOError) as e: - print(f"jobserver: warning: {repr(e)}", file=sys.stderr) - # Any missing environment strings or bad fds should result in just - # not being parallel. - self.claim = None - - self.is_open = True + except (IOError, OSError) as e: + warn('Unable to reopen jobserver read-side pipe:', repr(e)) + return + # + # OK, we have the channel to the job server; read out as many jobserver + # slots as possible. + # + while True: + try: + slot = os.read(self.reader, 8) + if not slot: + # + # Something went wrong. Clear self.jobs to avoid writing + # weirdness back to the jobserver and give up. + self.jobs = b"" + warn("unexpected empty token from jobserver;" + " possible invalid '--jobserver-auth=' setting") + self.claim = None + return + except (OSError, IOError) as e: + # + # If there is nothing more to read then we are done. + # + if e.errno == errno.EWOULDBLOCK: + break + # + # Anything else says that something went weird; give back + # the jobs and give up. + # + if self.jobs: + os.write(self.writer, self.jobs) + self.claim = None + warn('error reading from jobserver pipe', repr(e)) + return + self.jobs += slot + # + # Add a bump for our caller's reserveration, since we're just going + # to sit here blocked on our child. + # + self.claim = len(self.jobs) + 1 def close(self): """Return all reserved slots to Jobserver""" -- cgit v1.2.3 From 01ea38942bdcd28a7962d49d6f3a602979b81009 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 20 Jan 2026 17:47:09 -0300 Subject: perf tests sw-clock: Mark the volatile tmp variable as __maybe_unused As it is just used to waste some cycles, not being used as all, to silence some compilers. Noticed with gcc version 16.0.1 20260115 on fedora 44: tests/sw-clock.c: In function '__test__sw_clock_freq': tests/sw-clock.c:31:22: error: variable 'tmp' set but not used [-Werror=unused-but-set-variable=] 31 | volatile int tmp = 0; | ^~~ Reviewed-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sw-clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 4a2ad7176fa0..b6e46975379c 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "tests.h" @@ -28,7 +29,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) { int i, err = -1; - volatile int tmp = 0; + volatile int tmp __maybe_unused = 0; u64 total_periods = 0; int nr_samples = 0; char sbuf[STRERR_BUFSIZE]; -- cgit v1.2.3 From 2c850606a46b319d5128bda59f67b1fc642d94ef Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 20 Jan 2026 17:57:43 -0300 Subject: perf trace: Deal with compiler const checks The strchr() function these days return const/non-const based on the arg it receives, and sometimes we need to use casts when we're dealing with variables that are used in code that needs to safely change the returned value and sometimes not (as it points to really const areas). Tweak one such case. Reviewed-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 58a32adafddf..8df5ca44e4f9 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -5190,7 +5190,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, } while (1) { - if ((sep = strchr(s, ',')) != NULL) + if ((sep = strchr((char *)s, ',')) != NULL) *sep = '\0'; list = 0; -- cgit v1.2.3 From 2583e81fd8852e6cf6730c4463b6580deb7a8aad Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:29 -0800 Subject: resolve_btfids: Introduce finalize_btf() step Since recently [1][2] resolve_btfids executes final adjustments to the kernel/module BTF before it's embedded into the target binary. To keep the implementation simple, a clear and stable "pipeline" of how BTF data flows through resolve_btfids would be helpful. Some BTF modifications may change the ids of the types, so it is important to maintain correct order of operations with respect to .BTF_ids resolution too. This patch refactors the BTF handling to establish the following sequence: - load target ELF sections - load .BTF_ids symbols - this will be a dependency of btf2btf transformations in subsequent patches - load BTF and its base as is - (*) btf2btf transformations will happen here - finalize_btf(), introduced in this patch - does distill base and sort BTF - resolve and patch .BTF_ids This approach helps to avoid fixups in .BTF_ids data in case the ids change at any point of BTF processing, because symbol resolution happens on the finalized, ready to dump, BTF data. This also gives flexibility in BTF transformations, because they will happen on BTF that is not distilled and/or sorted yet, allowing to freely add, remove and modify BTF types. [1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/ [2] https://lore.kernel.org/bpf/20260109130003.3313716-1-dolinux.peng@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-5-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 69 ++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 343d08050116..1fcf37af6764 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -563,19 +563,6 @@ static int load_btf(struct object *obj) obj->base_btf = base_btf; obj->btf = btf; - if (obj->base_btf && obj->distill_base) { - err = btf__distill_base(obj->btf, &base_btf, &btf); - if (err) { - pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); - goto out_err; - } - - btf__free(obj->base_btf); - btf__free(obj->btf); - obj->base_btf = base_btf; - obj->btf = btf; - } - return 0; out_err: @@ -911,6 +898,41 @@ out: return err; } +static int finalize_btf(struct object *obj) +{ + struct btf *base_btf = obj->base_btf, *btf = obj->btf; + int err; + + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + + err = sort_btf_by_name(obj->btf); + if (err) { + pr_err("FAILED to sort BTF: %s\n", strerror(errno)); + goto out_err; + } + + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; + + return err; +} + static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) { int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); @@ -1054,6 +1076,7 @@ int main(int argc, const char **argv) }; const char *btfids_path = NULL; bool fatal_warnings = false; + bool resolve_btfids = true; char out_path[PATH_MAX]; struct option btfid_options[] = { @@ -1083,12 +1106,6 @@ int main(int argc, const char **argv) if (btfids_path) return patch_btfids(btfids_path, obj.path); - if (load_btf(&obj)) - goto out; - - if (sort_btf_by_name(obj.btf)) - goto out; - if (elf_collect(&obj)) goto out; @@ -1099,12 +1116,22 @@ int main(int argc, const char **argv) if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); - goto dump_btf; + resolve_btfids = false; } - if (symbols_collect(&obj)) + if (resolve_btfids) + if (symbols_collect(&obj)) + goto out; + + if (load_btf(&obj)) goto out; + if (finalize_btf(&obj)) + goto out; + + if (!resolve_btfids) + goto dump_btf; + if (symbols_resolve(&obj)) goto out; -- cgit v1.2.3 From 9d199965990c0b21160c565076b93725d6638c28 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:30 -0800 Subject: resolve_btfids: Support for KF_IMPLICIT_ARGS Implement BTF modifications in resolve_btfids to support BPF kernel functions with implicit arguments. For a kfunc marked with KF_IMPLICIT_ARGS flag, a new function prototype is added to BTF that does not have implicit arguments. The kfunc's prototype is then updated to a new one in BTF. This prototype is the intended interface for the BPF programs. A _impl function is added to BTF to make the original kfunc prototype searchable for the BPF verifier. If a _impl function already exists in BTF, its interpreted as a legacy case, and this step is skipped. Whether an argument is implicit is determined by its type: currently only `struct bpf_prog_aux *` is supported. As a result, the BTF associated with kfunc is changed from __bpf_kfunc bpf_foo(int arg1, struct bpf_prog_aux *aux); into bpf_foo_impl(int arg1, struct bpf_prog_aux *aux); __bpf_kfunc bpf_foo(int arg1); For more context see previous discussions and patches [1][2]. [1] https://lore.kernel.org/dwarves/ba1650aa-fafd-49a8-bea4-bdddee7c38c9@linux.dev/ [2] https://lore.kernel.org/bpf/20251029190113.3323406-1-ihor.solodrai@linux.dev/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-6-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 382 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 1fcf37af6764..db8d1554bdcc 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -152,6 +152,25 @@ struct object { int nr_typedefs; }; +#define KF_IMPLICIT_ARGS (1 << 16) +#define KF_IMPL_SUFFIX "_impl" + +struct kfunc { + const char *name; + u32 btf_id; + u32 flags; +}; + +struct btf2btf_context { + struct btf *btf; + u32 *decl_tags; + u32 nr_decl_tags; + u32 max_decl_tags; + struct kfunc *kfuncs; + u32 nr_kfuncs; + u32 max_kfuncs; +}; + static int verbose; static int warnings; @@ -837,6 +856,366 @@ static int dump_raw_btf(struct btf *btf, const char *out_path) return 0; } +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, s32 type_id) +{ + const struct btf_type *t = btf__type_by_id(btf, type_id); + + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + + return t; +} + +static int push_decl_tag_id(struct btf2btf_context *ctx, u32 decl_tag_id) +{ + u32 *arr = ctx->decl_tags; + u32 cap = ctx->max_decl_tags; + + if (ctx->nr_decl_tags + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(u32) * cap); + if (!arr) + return -ENOMEM; + ctx->max_decl_tags = cap; + ctx->decl_tags = arr; + } + + ctx->decl_tags[ctx->nr_decl_tags++] = decl_tag_id; + + return 0; +} + +static int push_kfunc(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + struct kfunc *arr = ctx->kfuncs; + u32 cap = ctx->max_kfuncs; + + if (ctx->nr_kfuncs + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(struct kfunc) * cap); + if (!arr) + return -ENOMEM; + ctx->max_kfuncs = cap; + ctx->kfuncs = arr; + } + + ctx->kfuncs[ctx->nr_kfuncs++] = *kfunc; + + return 0; +} + +static int collect_decl_tags(struct btf2btf_context *ctx) +{ + const u32 type_cnt = btf__type_cnt(ctx->btf); + struct btf *btf = ctx->btf; + const struct btf_type *t; + int err; + + for (u32 id = 1; id < type_cnt; id++) { + t = btf__type_by_id(btf, id); + if (!btf_is_decl_tag(t)) + continue; + err = push_decl_tag_id(ctx, id); + if (err) + return err; + } + + return 0; +} + +/* + * To find the kfunc flags having its struct btf_id (with ELF addresses) + * we need to find the address that is in range of a set8. + * If a set8 is found, then the flags are located at addr + 4 bytes. + * Return 0 (no flags!) if not found. + */ +static u32 find_kfunc_flags(struct object *obj, struct btf_id *kfunc_id) +{ + const u32 *elf_data_ptr = obj->efile.idlist->d_buf; + u64 set_lower_addr, set_upper_addr, addr; + struct btf_id *set_id; + struct rb_node *next; + u32 flags; + u64 idx; + + for (next = rb_first(&obj->sets); next; next = rb_next(next)) { + set_id = rb_entry(next, struct btf_id, rb_node); + if (set_id->kind != BTF_ID_KIND_SET8 || set_id->addr_cnt != 1) + continue; + + set_lower_addr = set_id->addr[0]; + set_upper_addr = set_lower_addr + set_id->cnt * sizeof(u64); + + for (u32 i = 0; i < kfunc_id->addr_cnt; i++) { + addr = kfunc_id->addr[i]; + /* + * Lower bound is exclusive to skip the 8-byte header of the set. + * Upper bound is inclusive to capture the last entry at offset 8*cnt. + */ + if (set_lower_addr < addr && addr <= set_upper_addr) { + pr_debug("found kfunc %s in BTF_ID_FLAGS %s\n", + kfunc_id->name, set_id->name); + idx = addr - obj->efile.idlist_addr; + idx = idx / sizeof(u32) + 1; + flags = elf_data_ptr[idx]; + + return flags; + } + } + } + + return 0; +} + +static int collect_kfuncs(struct object *obj, struct btf2btf_context *ctx) +{ + const char *tag_name, *func_name; + struct btf *btf = ctx->btf; + const struct btf_type *t; + u32 flags, func_id; + struct kfunc kfunc; + struct btf_id *id; + int err; + + if (ctx->nr_decl_tags == 0) + return 0; + + for (u32 i = 0; i < ctx->nr_decl_tags; i++) { + t = btf__type_by_id(btf, ctx->decl_tags[i]); + if (btf_kflag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") != 0) + continue; + + func_id = t->type; + t = btf__type_by_id(btf, func_id); + if (!btf_is_func(t)) + continue; + + func_name = btf__name_by_offset(btf, t->name_off); + if (!func_name) + continue; + + id = btf_id__find(&obj->funcs, func_name); + if (!id || id->kind != BTF_ID_KIND_SYM) + continue; + + flags = find_kfunc_flags(obj, id); + + kfunc.name = id->name; + kfunc.btf_id = func_id; + kfunc.flags = flags; + + err = push_kfunc(ctx, &kfunc); + if (err) + return err; + } + + return 0; +} + +static int build_btf2btf_context(struct object *obj, struct btf2btf_context *ctx) +{ + int err; + + ctx->btf = obj->btf; + + err = collect_decl_tags(ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect decl tags from BTF\n"); + return err; + } + + err = collect_kfuncs(obj, ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect kfuncs from BTF\n"); + return err; + } + + return 0; +} + + +/* Implicit BPF kfunc arguments can only be of particular types */ +static bool is_kf_implicit_arg(const struct btf *btf, const struct btf_param *p) +{ + static const char *const kf_implicit_arg_types[] = { + "bpf_prog_aux", + }; + const struct btf_type *t; + const char *name; + + t = btf_type_skip_qualifiers(btf, p->type); + if (!btf_is_ptr(t)) + return false; + + t = btf_type_skip_qualifiers(btf, t->type); + if (!btf_is_struct(t)) + return false; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + return false; + + for (int i = 0; i < ARRAY_SIZE(kf_implicit_arg_types); i++) + if (strcmp(name, kf_implicit_arg_types[i]) == 0) + return true; + + return false; +} + +/* + * For a kfunc with KF_IMPLICIT_ARGS we do the following: + * 1. Add a new function with _impl suffix in the name, with the prototype + * of the original kfunc. + * 2. Add all decl tags except "bpf_kfunc" for the _impl func. + * 3. Add a new function prototype with modified list of arguments: + * omitting implicit args. + * 4. Change the prototype of the original kfunc to the new one. + * + * This way we transform the BTF associated with the kfunc from + * __bpf_kfunc bpf_foo(int arg1, void *implicit_arg); + * into + * bpf_foo_impl(int arg1, void *implicit_arg); + * __bpf_kfunc bpf_foo(int arg1); + * + * If a kfunc with KF_IMPLICIT_ARGS already has an _impl counterpart + * in BTF, then it's a legacy case: an _impl function is declared in the + * source code. In this case, we can skip adding an _impl function, but we + * still have to add a func prototype that omits implicit args. + */ +static int process_kfunc_with_implicit_args(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + s32 idx, new_proto_id, new_func_id, proto_id; + const char *param_name, *tag_name; + const struct btf_param *params; + enum btf_func_linkage linkage; + char tmp_name[KSYM_NAME_LEN]; + struct btf *btf = ctx->btf; + int err, len, nr_params; + struct btf_type *t; + + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + if (!t || !btf_is_func(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function\n", kfunc->btf_id); + return -EINVAL; + } + + linkage = btf_vlen(t); + + proto_id = t->type; + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + if (!t || !btf_is_func_proto(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function prototype\n", proto_id); + return -EINVAL; + } + + len = snprintf(tmp_name, sizeof(tmp_name), "%s%s", kfunc->name, KF_IMPL_SUFFIX); + if (len < 0 || len >= sizeof(tmp_name)) { + pr_err("ERROR: function name is too long: %s%s\n", kfunc->name, KF_IMPL_SUFFIX); + return -E2BIG; + } + + if (btf__find_by_name_kind(btf, tmp_name, BTF_KIND_FUNC) > 0) { + pr_debug("resolve_btfids: function %s already exists in BTF\n", tmp_name); + goto add_new_proto; + } + + /* Add a new function with _impl suffix and original prototype */ + new_func_id = btf__add_func(btf, tmp_name, linkage, proto_id); + if (new_func_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func %s to BTF\n", tmp_name); + return new_func_id; + } + + /* Copy all decl tags except "bpf_kfunc" from the original kfunc to the new one */ + for (int i = 0; i < ctx->nr_decl_tags; i++) { + t = (struct btf_type *)btf__type_by_id(btf, ctx->decl_tags[i]); + if (t->type != kfunc->btf_id) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") == 0) + continue; + + idx = btf_decl_tag(t)->component_idx; + + if (btf_kflag(t)) + err = btf__add_decl_attr(btf, tag_name, new_func_id, idx); + else + err = btf__add_decl_tag(btf, tag_name, new_func_id, idx); + + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add decl tag %s for %s\n", + tag_name, tmp_name); + return -EINVAL; + } + } + +add_new_proto: + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + new_proto_id = btf__add_func_proto(btf, t->type); + if (new_proto_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func proto for %s\n", kfunc->name); + return new_proto_id; + } + + /* Add non-implicit args to the new prototype */ + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + nr_params = btf_vlen(t); + for (int i = 0; i < nr_params; i++) { + params = btf_params(t); + if (is_kf_implicit_arg(btf, ¶ms[i])) + break; + param_name = btf__name_by_offset(btf, params[i].name_off); + err = btf__add_func_param(btf, param_name, params[i].type); + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add param %s for %s\n", + param_name, kfunc->name); + return err; + } + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + } + + /* Finally change the prototype of the original kfunc to the new one */ + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + t->type = new_proto_id; + + pr_debug("resolve_btfids: updated BTF for kfunc with implicit args %s\n", kfunc->name); + + return 0; +} + +static int btf2btf(struct object *obj) +{ + struct btf2btf_context ctx = {}; + int err; + + err = build_btf2btf_context(obj, &ctx); + if (err) + goto out; + + for (u32 i = 0; i < ctx.nr_kfuncs; i++) { + struct kfunc *kfunc = &ctx.kfuncs[i]; + + if (!(kfunc->flags & KF_IMPLICIT_ARGS)) + continue; + + err = process_kfunc_with_implicit_args(&ctx, kfunc); + if (err) + goto out; + } + + err = 0; +out: + free(ctx.decl_tags); + free(ctx.kfuncs); + + return err; +} + /* * Sort types by name in ascending order resulting in all * anonymous types being placed before named types. @@ -1126,6 +1505,9 @@ int main(int argc, const char **argv) if (load_btf(&obj)) goto out; + if (btf2btf(&obj)) + goto out; + if (finalize_btf(&obj)) goto out; -- cgit v1.2.3 From e939f3d16d77a88e5f363394ef73db4c898c4107 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:31 -0800 Subject: selftests/bpf: Add tests for KF_IMPLICIT_ARGS Add trivial end-to-end tests to validate that KF_IMPLICIT_ARGS flag is properly handled by both resolve_btfids and the verifier. Declare kfuncs in bpf_testmod. Check that bpf_prog_aux pointer is set in the kfunc implementation. Verify that calls with implicit args and a legacy case all work. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-7-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kfunc_implicit_args.c | 10 ++++++ .../selftests/bpf/progs/kfunc_implicit_args.c | 41 ++++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 26 ++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_implicit_args.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c new file mode 100644 index 000000000000..5e4793c9c29a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include "kfunc_implicit_args.skel.h" + +void test_kfunc_implicit_args(void) +{ + RUN_TESTS(kfunc_implicit_args); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c new file mode 100644 index 000000000000..89b6a47e22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +extern int bpf_kfunc_implicit_arg(int a) __weak __ksym; +extern int bpf_kfunc_implicit_arg_impl(int a, struct bpf_prog_aux *aux) __weak __ksym; /* illegal */ +extern int bpf_kfunc_implicit_arg_legacy(int a, int b) __weak __ksym; +extern int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) __weak __ksym; + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +__retval(5) +int test_kfunc_implicit_arg(void *ctx) +{ + return bpf_kfunc_implicit_arg(5); +} + +SEC("syscall") +__failure __msg("cannot find address for kernel function bpf_kfunc_implicit_arg_impl") +int test_kfunc_implicit_arg_impl_illegal(void *ctx) +{ + return bpf_kfunc_implicit_arg_impl(5, NULL); +} + +SEC("syscall") +__retval(7) +int test_kfunc_implicit_arg_legacy(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy(3, 4); +} + +SEC("syscall") +__retval(11) +int test_kfunc_implicit_arg_legacy_impl(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy_impl(5, 6, NULL); +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index bc07ce9d5477..a996b816ecc4 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1142,6 +1142,10 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1184,6 +1188,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1675,6 +1682,25 @@ int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog return ret; } +int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux) +{ + if (aux && a > 0) + return a; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux) +{ + if (aux) + return a + b; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) +{ + return bpf_kfunc_implicit_arg_legacy(a, b, aux); +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = -- cgit v1.2.3 From b97931a25a4bc74076ffb5c3d1a534c71ade4d55 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:32 -0800 Subject: bpf: Migrate bpf_wq_set_callback_impl() to KF_IMPLICIT_ARGS Implement bpf_wq_set_callback() with an implicit bpf_prog_aux argument, and remove bpf_wq_set_callback_impl(). Update special kfunc checks in the verifier accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++------ kernel/bpf/verifier.c | 16 ++++++++-------- tools/testing/selftests/bpf/bpf_experimental.h | 5 ----- .../selftests/bpf/progs/verifier_async_cb_context.c | 4 ++-- tools/testing/selftests/bpf/progs/wq_failures.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9eaa4185e0a7..c76a9003b221 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3120,12 +3120,11 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) return 0; } -__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) +__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + struct bpf_prog_aux *aux) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -4488,7 +4487,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) -BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adc24a2ce5b6..51e8c9f70868 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -520,7 +520,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) @@ -562,7 +562,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn /* bpf_wq and bpf_task_work callbacks are always sleepable. */ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && - (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); @@ -12437,7 +12437,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -12501,7 +12501,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -12994,7 +12994,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } @@ -13004,9 +13004,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn) insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -14085,7 +14085,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2cd9165c7348..68a49b1f77ae 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) struct bpf_iter_kmem_cache; extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 7efa9521105e..5d5e1cd4d51d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index d06f6d40594a..3767f5595bbc 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -97,7 +97,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("arg#0 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { @@ -123,7 +123,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") long test_wrong_wq_pointer_offset(void *ctx) { -- cgit v1.2.3 From 8157cc739ad301b7fb6dfc4cfc5497cedd33df4e Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:33 -0800 Subject: HID: Use bpf_wq_set_callback kernel function Remove extern declaration of bpf_wq_set_callback_impl() from hid_bpf_helpers.h and replace bpf_wq_set_callback macro with a corresponding new declaration. Tested with: # append tools/testing/selftests/hid/config and build the kernel $ make -C tools/testing/selftests/hid # in built kernel $ ./tools/testing/selftests/hid/hid_bpf -t test_multiply_events_wq TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN hid_bpf.test_multiply_events_wq ... [ 2.575520] hid-generic 0003:0001:0A36.0001: hidraw0: USB HID v0.00 Device [test-uhid-device-138] on 138 # OK hid_bpf.test_multiply_events_wq ok 1 hid_bpf.test_multiply_events_wq # PASSED: 1 / 1 tests passed. # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 PASS Acked-by: Benjamin Tissoires Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-9-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- drivers/hid/bpf/progs/hid_bpf_helpers.h | 8 +++----- tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/drivers/hid/bpf/progs/hid_bpf_helpers.h b/drivers/hid/bpf/progs/hid_bpf_helpers.h index bf19785a6b06..228f8d787567 100644 --- a/drivers/hid/bpf/progs/hid_bpf_helpers.h +++ b/drivers/hid/bpf/progs/hid_bpf_helpers.h @@ -33,11 +33,9 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(wq, cb, flags) \ - bpf_wq_set_callback_impl(wq, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #define HID_MAX_DESCRIPTOR_SIZE 4096 #define HID_IGNORE_EVENT -1 diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 531228b849da..80ab60905865 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -116,10 +116,8 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *wq), - unsigned int flags__k, void *aux__ign) __weak __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #endif /* __HID_BPF_HELPERS_H */ -- cgit v1.2.3 From 6e663ffdf7600168338fdfa2fd1eed83395d58a3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:34 -0800 Subject: bpf: Migrate bpf_task_work_schedule_* kfuncs to KF_IMPLICIT_ARGS Implement bpf_task_work_schedule_* with an implicit bpf_prog_aux argument, and remove corresponding _impl funcs from the kernel. Update special kfunc checks in the verifier accordingly. Update the selftests to use the new API with implicit argument. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++------------ kernel/bpf/verifier.c | 12 ++++----- tools/testing/selftests/bpf/progs/file_reader.c | 2 +- tools/testing/selftests/bpf/progs/task_work.c | 7 +++-- tools/testing/selftests/bpf/progs/task_work_fail.c | 8 +++--- .../testing/selftests/bpf/progs/task_work_stress.c | 4 +-- .../bpf/progs/verifier_async_cb_context.c | 4 +-- 7 files changed, 32 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c76a9003b221..f2f974b5fb3b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4274,41 +4274,39 @@ release_prog: } /** - * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, @@ -4536,8 +4534,8 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51e8c9f70868..8e8570e9d167 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12457,8 +12457,8 @@ enum special_kfunc_type { KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal_impl, - KF_bpf_task_work_schedule_resume_impl, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, @@ -12534,16 +12534,16 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal_impl) -BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 4d756b623557..462712ff3b8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c) err = 1; return 0; } - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 663a80990f8f..a6009d105158 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args) work = bpf_map_lookup_elem(&hmap, &key); if (!work) return 0; - - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work); return 0; } @@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 1270953fd092..82e4b8913333 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 55e555f7f41b..1d4378f351ef 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 5d5e1cd4d51d..39aff82549c9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } @@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } -- cgit v1.2.3 From d806f3101276a1ed18d963944580e1ee1c7a3d26 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:35 -0800 Subject: bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument, and remote bpf_stream_vprintk_impl from the kernel. Update the selftests to use the new API with implicit argument. bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk kfunc, and the extern definition of bpf_stream_vprintk_impl is replaced accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 5 ++--- tools/lib/bpf/bpf_helpers.h | 6 +++--- tools/testing/selftests/bpf/progs/stream_fail.c | 6 +++--- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f2f974b5fb3b..f8aa1320e2f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4533,7 +4533,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 0b6bc3f30335..24730df55e69 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -212,14 +212,13 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; - struct bpf_prog_aux *aux = aux__prog; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index d4e4e388e625..c145da05a67c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; #define bpf_stream_printk(stream_id, fmt, args...) \ ({ \ @@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo ___bpf_fill(___param, args); \ _Pragma("GCC diagnostic pop") \ \ - bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ + bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 3662515f0107..8e8249f3521c 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); return 0; } -- cgit v1.2.3 From bd06b977e02d80fe0dec303cc219d007121a4526 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:36 -0800 Subject: selftests/bpf: Migrate struct_ops_assoc test to KF_IMPLICIT_ARGS A test kfunc named bpf_kfunc_multi_st_ops_test_1_impl() is a user of __prog suffix. Subsequent patch removes __prog support in favor of KF_IMPLICIT_ARGS, so migrate this kfunc to use implicit argument. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-12-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/struct_ops_assoc.c | 8 ++++---- tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c | 4 ++-- tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c | 6 +++--- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 9 ++++----- tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 6 ++++-- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c index 8f1097903e22..68842e3f936b 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -32,7 +32,7 @@ int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -45,7 +45,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -79,7 +79,7 @@ int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; @@ -92,7 +92,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c index d5a2ea934284..0bed49e9f217 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -31,7 +31,7 @@ __noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) struct st_ops_args args = {}; recur++; - timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); recur--; timer_cb_run++; @@ -64,7 +64,7 @@ int syscall_prog(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_MAGIC) test_err++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c index 5bb6ebf5eed4..396b3e58c729 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -23,7 +23,7 @@ int BPF_PROG(test_1_a, struct st_ops_args *args) if (!recur) { recur++; - ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(args); if (ret != -1) test_err_a++; recur--; @@ -40,7 +40,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -62,7 +62,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index a996b816ecc4..0d542ba64365 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1140,7 +1140,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); -__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); @@ -1187,7 +1187,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) @@ -1669,13 +1669,12 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) } /* Call test_1() of the associated struct_ops map */ -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux) { - struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; struct bpf_testmod_multi_st_ops *st_ops; int ret = -1; - st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(aux); if (st_ops) ret = st_ops->test_1(args); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 2357a0340ffe..225ea30c4e3d 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -161,7 +161,9 @@ void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; -int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; +#ifndef __KERNEL__ +extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __weak __ksym; +extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; +#endif #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 44fdd581d27366092e162b42f025d75d5a16c851 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:57 +0800 Subject: bpf: Add range tracking for BPF_DIV and BPF_MOD This patch implements range tracking (interval analysis) for BPF_DIV and BPF_MOD operations when the divisor is a constant, covering both signed and unsigned variants. While LLVM typically optimizes integer division and modulo by constants into multiplication and shift sequences, this optimization is less effective for the BPF target when dealing with 64-bit arithmetic. Currently, the verifier does not track bounds for scalar division or modulo, treating the result as "unbounded". This leads to false positive rejections for safe code patterns. For example, the following code (compiled with -O2): ```c int test(struct pt_regs *ctx) { char buffer[6] = {1}; __u64 x = bpf_ktime_get_ns(); __u64 res = x % sizeof(buffer); char value = buffer[res]; bpf_printk("res = %llu, val = %d", res, value); return 0; } ``` Generates a raw `BPF_MOD64` instruction: ```asm ; __u64 res = x % sizeof(buffer); 1: 97 00 00 00 06 00 00 00 r0 %= 0x6 ; char value = buffer[res]; 2: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 4: 0f 01 00 00 00 00 00 00 r1 += r0 5: 91 14 00 00 00 00 00 00 r4 = *(s8 *)(r1 + 0x0) ``` Without this patch, the verifier fails with "math between map_value pointer and register with unbounded min value is not allowed" because it cannot deduce that `r0` is within [0, 5]. According to the BPF instruction set[1], the instruction's offset field (`insn->off`) is used to distinguish between signed (`off == 1`) and unsigned division (`off == 0`). Moreover, we also follow the BPF division and modulo runtime behavior (semantics) to handle special cases, such as division by zero and signed division overflow. - UDIV: dst = (src != 0) ? (dst / src) : 0 - SDIV: dst = (src == 0) ? 0 : ((src == -1 && dst == LLONG_MIN) ? LLONG_MIN : (dst / src)) - UMOD: dst = (src != 0) ? (dst % src) : dst - SMOD: dst = (src == 0) ? dst : ((src == -1 && dst == LLONG_MIN) ? 0: (dst s% src)) Here is the overview of the changes made in this patch (See the code comments for more details and examples): 1. For BPF_DIV: Firstly check whether the divisor is zero. If so, set the destination register to zero (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)div` functions. - General cases: compute the new range by dividing max_dividend and min_dividend by the constant divisor. - Overflow case (SIGNED_MIN / -1) in signed division: mark the result as unbounded if the dividend is not a single number. 2. For BPF_MOD: Firstly check whether the divisor is zero. If so, leave the destination register unchanged (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)mod` functions. - General case: For signed modulo, the result's sign matches the dividend's sign. And the result's absolute value is strictly bounded by `min(abs(dividend), abs(divisor) - 1)`. - Special care is taken when the divisor is SIGNED_MIN. By casting to unsigned before negation and subtracting 1, we avoid signed overflow and correctly calculate the maximum possible magnitude (`res_max_abs` in the code). - "Small dividend" case: If the dividend is already within the possible result range (e.g., [-2, 5] % 10), the operation is an identity function, and the destination register remains unchanged. 3. In `scalar(32)?_min_max_(u|s)(div|mod)` functions: After updating current range, reset other ranges and tnum to unbounded/unknown. e.g., in `scalar_min_max_sdiv`, signed 64-bit range is updated. Then reset unsigned 64-bit range and 32-bit range to unbounded, and tnum to unknown. Exception: in BPF_MOD's "small dividend" case, since the result remains unchanged, we do not reset other ranges/tnum. 4. Also updated existing selftests based on the expected BPF_DIV and BPF_MOD behavior. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Tested-by: syzbot@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20260119085458.182221-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 299 +++++++++++++++++++++ .../bpf/progs/verifier_value_illegal_alu.c | 7 +- tools/testing/selftests/bpf/verifier/precise.c | 4 +- 3 files changed, 305 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 919556614505..f11dc5366e5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2349,6 +2349,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -15159,6 +15171,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -15564,6 +15822,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_MUL: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. @@ -15616,6 +15882,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -15667,6 +15934,38 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 2129e4353fd9..4d8273c258d5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void) asm volatile(" \ r6 = r1; \ r7 = *(u64*)(r6 + %[flow_keys_off]); \ - r8 = 8; \ - r8 /= 1; \ + call %[bpf_get_prandom_u32]; \ + r8 = r0; \ r8 &= 8; \ r7 += r8; \ r0 = *(u64*)(r7 + 0); \ exit; \ " : - : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) : __clobber_all); } diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 59a020c35647..061d98f6e9bb 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -229,11 +229,11 @@ { "precise: program doesn't prematurely prune branches", .insns = { - BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000), - BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401), BPF_JMP_IMM(BPF_JA, 0, 0, 0), BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2), BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1), -- cgit v1.2.3 From c9e440bf25a712d906c40ba3ef831f3f0ccc6a1b Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:58 +0800 Subject: selftests/bpf: Add tests for BPF_DIV and BPF_MOD range tracking Now BPF_DIV has range tracking support via interval analysis. This patch adds selftests to cover various cases of BPF_DIV and BPF_MOD operations when the divisor is a constant, also covering both signed and unsigned variants. This patch includes several types of tests in 32-bit and 64-bit variants: 1. For UDIV - positive divisor - zero divisor 2. For SDIV - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend 3. For UMOD - positive divisor - positive divisor, small dividend - zero divisor 4. For SMOD - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - positive divisor, mixed sign dividend, small dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - negative divisor, mixed sign dividend, small dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend Specifically, these selftests are based on dead code elimination: If the BPF verifier can precisely analyze the result of BPF_DIV/BPF_MOD instruction, it can prune the path that leads to an error (here we use invalid memory access as the error case), allowing the program to pass verification. Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260119085458.182221-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_div_mod_bounds.c | 1149 ++++++++++++++++++++ 2 files changed, 1151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 38c5ba70100c..fa9e506cc36f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -33,6 +33,7 @@ #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" +#include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" @@ -175,6 +176,7 @@ void test_verifier_d_path(void) { RUN(verifier_d_path); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } +void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c new file mode 100644 index 000000000000..4672af0b3268 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c @@ -0,0 +1,1149 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" + +/* This file contains unit tests for signed/unsigned division and modulo + * operations (with divisor as a constant), focusing on verifying whether + * BPF verifier's range tracking module soundly and precisely computes + * the results. + */ + +SEC("socket") +__description("UDIV32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 /= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= w2 {{.*}}; R1=0 R2=0") +__naked void udiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 /= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 /= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= r2 {{.*}}; R1=0 R2=0") +__naked void udiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 /= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=3,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-3,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s/= w2 {{.*}}; R1=0 R2=0") +__naked void sdiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 s/= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=0x80000000") +__naked void sdiv32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s/= -1; \ + if w1 != %[int_min] goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=3)") +__naked void sdiv64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=2)") +__naked void sdiv64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s/= r2 {{.*}}; R1=0 R2=0") +__naked void sdiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 s/= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=scalar()") +__naked void sdiv64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=0x8000000000000000") +__naked void sdiv64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s/= -1; \ + r2 = %[llong_min] ll; \ + if r1 != r2 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 10; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 %%= w2; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 10; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 %%= r2; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0") +__naked void smod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w2 = 0; \ + w1 s%%= w2; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s%%= -1; \ + if w1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s%%= -1; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0") +__naked void smod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r2 = 0; \ + r1 s%%= r2; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s%%= -1; \ + if r1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s%%= -1; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} -- cgit v1.2.3 From a917cd0a23fae160a85b0e8a0dd1d548c5d5242e Mon Sep 17 00:00:00 2001 From: Michel Lind Date: Fri, 16 Jan 2026 21:21:58 +0000 Subject: tools/net/ynl: Makefile's install target now installs ynltool This tool is built by default, but was not being installed by default when running `make install`. Fix this by calling ynltool's install target. Signed-off-by: Michel Lind Link: https://patch.msgid.link/aWqr9gUT4hWZwwcI@mbp-m3-fedora.vm Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/Makefile b/tools/net/ynl/Makefile index c2f3e8b3f2ac..9b692f368be7 100644 --- a/tools/net/ynl/Makefile +++ b/tools/net/ynl/Makefile @@ -41,7 +41,7 @@ clean distclean: rm -rf pyynl.egg-info rm -rf build -install: libynl.a lib/*.h +install: libynl.a lib/*.h ynltool @echo -e "\tINSTALL libynl.a" @$(INSTALL) -d $(DESTDIR)$(libdir) @$(INSTALL) -m 0644 libynl.a $(DESTDIR)$(libdir)/libynl.a @@ -51,6 +51,7 @@ install: libynl.a lib/*.h @echo -e "\tINSTALL pyynl" @pip install --prefix=$(DESTDIR)$(prefix) . @make -C generated install + @make -C ynltool install run_tests: @$(MAKE) -C tests run_tests -- cgit v1.2.3 From dd341eacdba360d035c9d4de66d3c80a89d77c84 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 20 Jan 2026 09:16:30 +0000 Subject: selftests/bpf: update verifier test for default trusted pointer semantics Replace the verifier test for default trusted pointer semantics, which previously relied on BPF kfunc bpf_get_root_mem_cgroup(), with a new test utilizing dedicated BPF kfuncs defined within the bpf_testmod. bpf_get_root_mem_cgroup() was modified such that it again relies on KF_ACQUIRE semantics, therefore no longer making it a suitable candidate to test BPF verifier default trusted pointer semantics against. Link: https://lore.kernel.org/bpf/20260113083949.2502978-2-mattbobrowski@google.com Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260120091630.3420452-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 +-- .../bpf/progs/verifier_default_trusted_ptr.c | 29 ++++++++++++++++++++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 ---------------------- .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 18 ++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 3 ++ 5 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c delete mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index fa9e506cc36f..b6a1e79709be 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -30,6 +30,7 @@ #include "verifier_ctx.skel.h" #include "verifier_ctx_sk_msg.skel.h" #include "verifier_d_path.skel.h" +#include "verifier_default_trusted_ptr.skel.h" #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" @@ -62,7 +63,6 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" -#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -173,6 +173,7 @@ void test_verifier_const_or(void) { RUN(verifier_const_or); } void test_verifier_ctx(void) { RUN(verifier_ctx); } void test_verifier_ctx_sk_msg(void) { RUN(verifier_ctx_sk_msg); } void test_verifier_d_path(void) { RUN(verifier_d_path); } +void test_verifier_default_trusted_ptr(void) { RUN_TESTS(verifier_default_trusted_ptr); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } @@ -205,7 +206,6 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } -void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c new file mode 100644 index 000000000000..fa3b656ad4fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +SEC("syscall") +__success __retval(0) +int test_default_trusted_ptr(void *ctx) +{ + struct prog_test_member *trusted_ptr; + + trusted_ptr = bpf_kfunc_get_default_trusted_ptr_test(); + /* + * Test BPF kfunc bpf_get_default_trusted_ptr_test() returns a + * PTR_TO_BTF_ID | PTR_TRUSTED, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + bpf_kfunc_put_default_trusted_ptr_test(trusted_ptr); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c deleted file mode 100644 index 13564956f621..000000000000 --- a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2026 Google LLC. - */ - -#include -#include -#include -#include "bpf_misc.h" - -SEC("syscall") -__success __retval(0) -int root_mem_cgroup_default_trusted(void *ctx) -{ - unsigned long usage; - struct mem_cgroup *root_mem_cgroup; - - root_mem_cgroup = bpf_get_root_mem_cgroup(); - if (!root_mem_cgroup) - return 1; - - /* - * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | - * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when - * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. - */ - usage = bpf_mem_cgroup_usage(root_mem_cgroup); - __sink(usage); - return 0; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 0d542ba64365..d425034b72d3 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -254,6 +254,22 @@ __bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) return NULL; } +static struct prog_test_member trusted_ptr; + +__bpf_kfunc struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) +{ + return &trusted_ptr; +} + +__bpf_kfunc void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) +{ + /* + * This BPF kfunc doesn't actually have any put/KF_ACQUIRE + * semantics. We're simply wanting to simulate a BPF kfunc that takes a + * struct prog_test_member pointer as an argument. + */ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -709,6 +725,8 @@ BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) +BTF_ID_FLAGS(func, bpf_kfunc_get_default_trusted_ptr_test); +BTF_ID_FLAGS(func, bpf_kfunc_put_default_trusted_ptr_test); BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 225ea30c4e3d..10f89f06245f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -166,4 +166,7 @@ extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __wea extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; #endif +struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; +void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 8766d61a1d33cb5f15bfdd6ce9832bbe1fc649c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Jan 2026 18:04:55 -0800 Subject: Revert "Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'" This reverts commit 77b9c4a438fc66e2ab004c411056b3fb71a54f2c, reversing changes made to 4515ec4ad58a37e70a9e1256c0b993958c9b7497: 931420a2fc36 ("selftests/net: Add netkit container tests") ab771c938d9a ("selftests/net: Make NetDrvContEnv support queue leasing") 6be87fbb2776 ("selftests/net: Add env for container based tests") 61d99ce3dfc2 ("selftests/net: Add bpf skb forwarding program") 920da3634194 ("netkit: Add xsk support for af_xdp applications") eef51113f8af ("netkit: Add netkit notifier to check for unregistering devices") b5ef109d22d4 ("netkit: Implement rtnl_link_ops->alloc and ndo_queue_create") b5c3fa4a0b16 ("netkit: Add single device mode for netkit") 0073d2fd679d ("xsk: Proxy pool management for leased queues") 1ecea95dd3b5 ("xsk: Extend xsk_rcv_check validation") 804bf334d08a ("net: Proxy netdev_queue_get_dma_dev for leased queues") 0caa9a8ddec3 ("net: Proxy net_mp_{open,close}_rxq for leased queues") ff8889ff9107 ("net, ethtool: Disallow leased real rxqs to be resized") 9e2103f36110 ("net: Add lease info to queue-get response") 31127deddef4 ("net: Implement netdev_nl_queue_create_doit") a5546e18f77c ("net: Add queue-create operation") The series will conflict with io_uring work, and the code needs more polish. Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 44 --- drivers/net/netkit.c | 360 ++++----------------- include/linux/netdevice.h | 6 - include/net/netdev_queues.h | 19 +- include/net/netdev_rx_queue.h | 21 +- include/net/page_pool/memory_provider.h | 4 +- include/net/xdp_sock_drv.h | 2 +- include/uapi/linux/if_link.h | 6 - include/uapi/linux/netdev.h | 11 - net/core/dev.c | 7 - net/core/dev.h | 2 - net/core/netdev-genl-gen.c | 20 -- net/core/netdev-genl-gen.h | 2 - net/core/netdev-genl.c | 185 ----------- net/core/netdev_queues.c | 74 +---- net/core/netdev_rx_queue.c | 169 ++-------- net/ethtool/channels.c | 12 +- net/ethtool/ioctl.c | 9 +- net/xdp/xsk.c | 79 +---- tools/include/uapi/linux/netdev.h | 11 - tools/testing/selftests/drivers/net/README.rst | 7 - tools/testing/selftests/drivers/net/hw/Makefile | 2 - .../selftests/drivers/net/hw/lib/py/__init__.py | 7 +- .../selftests/drivers/net/hw/nk_forward.bpf.c | 49 --- tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 -- .../testing/selftests/drivers/net/hw/nk_qlease.py | 55 ---- .../selftests/drivers/net/lib/py/__init__.py | 7 +- tools/testing/selftests/drivers/net/lib/py/env.py | 157 --------- 28 files changed, 117 insertions(+), 1233 deletions(-) delete mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py (limited to 'tools') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b86db8656eac..596c306ce52b 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,15 +339,6 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info - - - name: lease - doc: | - A queue from a virtual device can have a lease which refers to - another queue from a physical device. This is useful for memory - providers and AF_XDP operations which take an ifindex and queue id - to allow applications to bind against virtual devices in containers. - type: nest - nested-attributes: lease - name: qstats doc: | @@ -546,24 +537,6 @@ attribute-sets: name: id - name: type - - - name: lease - attributes: - - - name: ifindex - doc: The netdev ifindex to lease the queue from. - type: u32 - checks: - min: 1 - - - name: queue - doc: The netdev queue to lease from. - type: nest - nested-attributes: queue-id - - - name: netns-id - doc: The network namespace id of the netdev. - type: s32 - name: dmabuf attributes: @@ -713,7 +686,6 @@ operations: - dmabuf - io-uring - xsk - - lease dump: request: attributes: @@ -825,22 +797,6 @@ operations: reply: attributes: - id - - - name: queue-create - doc: | - Create a new queue for the given netdevice. Whether this operation - is supported depends on the device and the driver. - attribute-set: queue - flags: [admin-perm] - do: - request: - attributes: - - ifindex - - type - - lease - reply: &queue-create-op - attributes: - - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 0519f855d062..0a2fef7caccb 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -9,21 +9,11 @@ #include #include -#include -#include -#include -#include #include #include #include -#define NETKIT_DRV_NAME "netkit" - -#define NETKIT_NUM_RX_QUEUES_MAX 1024 -#define NETKIT_NUM_TX_QUEUES_MAX 1 - -#define NETKIT_NUM_RX_QUEUES_REAL 1 -#define NETKIT_NUM_TX_QUEUES_REAL 1 +#define DRV_NAME "netkit" struct netkit { __cacheline_group_begin(netkit_fastpath); @@ -36,7 +26,6 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; - enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -47,8 +36,6 @@ struct netkit_link { struct net_device *dev; }; -static struct rtnl_link_ops netkit_link_ops; - static __always_inline int netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, enum netkit_action ret) @@ -148,10 +135,6 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); - if (nk->pair == NETKIT_DEVICE_SINGLE) { - netif_carrier_on(dev); - return 0; - } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -236,86 +219,9 @@ static void netkit_get_stats(struct net_device *dev, stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); } -static bool netkit_xsk_supported_at_phys(const struct net_device *dev) -{ - if (!dev->netdev_ops->ndo_bpf || - !dev->netdev_ops->ndo_xdp_xmit || - !dev->netdev_ops->ndo_xsk_wakeup) - return false; - if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) - return false; - return true; -} - -static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) -{ - struct netkit *nk = netkit_priv(dev); - struct netdev_bpf xdp_lower; - struct netdev_rx_queue *rxq; - struct net_device *phys; - int ret = -EBUSY; - - switch (xdp->command) { - case XDP_SETUP_XSK_POOL: - if (nk->pair == NETKIT_DEVICE_PAIR) - return -EOPNOTSUPP; - if (xdp->xsk.queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); - xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); - break; - case XDP_SETUP_PROG: - return -EPERM; - default: - return -EINVAL; - } - - netdev_lock(phys); - if (!dev_get_min_mp_channel_count(phys)) - ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); - netdev_unlock(phys); - return ret; -} - -static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) -{ - struct netdev_rx_queue *rxq; - struct net_device *phys; - - if (queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - return phys->netdev_ops->ndo_xsk_wakeup(phys, - get_netdev_rx_queue_index(rxq->lease), flags); -} - -static int netkit_init(struct net_device *dev) -{ - netdev_lockdep_set_classes(dev); - return 0; -} - static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { - .ndo_init = netkit_init, .ndo_open = netkit_open, .ndo_stop = netkit_close, .ndo_start_xmit = netkit_xmit, @@ -326,95 +232,19 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, - .ndo_bpf = netkit_xsk, - .ndo_xsk_wakeup = netkit_xsk_wakeup, .ndo_features_check = passthru_features_check, }; static void netkit_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); } static const struct ethtool_ops netkit_ethtool_ops = { .get_drvinfo = netkit_get_drvinfo, }; -static int netkit_queue_create(struct net_device *dev) -{ - struct netkit *nk = netkit_priv(dev); - u32 rxq_count_old, rxq_count_new; - int err; - - rxq_count_old = dev->real_num_rx_queues; - rxq_count_new = rxq_count_old + 1; - - /* Only allow to lease a queue in single device mode or to - * lease against the peer device which then ends up in the - * target netns. - */ - if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) - return -EOPNOTSUPP; - - if (netif_running(dev)) - netif_carrier_off(dev); - err = netif_set_real_num_rx_queues(dev, rxq_count_new); - if (netif_running(dev)) - netif_carrier_on(dev); - - return err ? : rxq_count_old; -} - -static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { - .ndo_queue_create = netkit_queue_create, -}; - -static struct net_device *netkit_alloc(struct nlattr *tb[], - const char *ifname, - unsigned char name_assign_type, - unsigned int num_tx_queues, - unsigned int num_rx_queues) -{ - const struct rtnl_link_ops *ops = &netkit_link_ops; - struct net_device *dev; - - if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || - num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) - return ERR_PTR(-EOPNOTSUPP); - - dev = alloc_netdev_mqs(ops->priv_size, ifname, - name_assign_type, ops->setup, - num_tx_queues, num_rx_queues); - if (dev) { - dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; - dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; - } - return dev; -} - -static void netkit_queue_unlease(struct net_device *dev) -{ - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev_lease; - int i; - - if (dev->real_num_rx_queues == 1) - return; - - netdev_lock(dev); - for (i = 1; i < dev->real_num_rx_queues; i++) { - rxq = __netif_get_rx_queue(dev, i); - rxq_lease = rxq->lease; - dev_lease = rxq_lease->dev; - - netdev_lock(dev_lease); - netdev_rx_queue_unlease(rxq, rxq_lease); - netdev_unlock(dev_lease); - } - netdev_unlock(dev); -} - static void netkit_setup(struct net_device *dev) { static const netdev_features_t netkit_features_hw_vlan = @@ -445,20 +275,18 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; - dev->netdev_ops = &netkit_netdev_ops; - dev->ethtool_ops = &netkit_ethtool_ops; - dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; + dev->ethtool_ops = &netkit_ethtool_ops; + dev->netdev_ops = &netkit_netdev_ops; dev->features |= netkit_features; dev->hw_features = netkit_features; dev->hw_enc_features = netkit_features; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features & ~netkit_features_hw_vlan; + dev->needs_free_netdev = true; netif_set_tso_max_size(dev, GSO_MAX_SIZE); - - xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); } static struct net *netkit_get_link_net(const struct net_device *dev) @@ -497,6 +325,8 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static struct rtnl_link_ops netkit_link_ops; + static int netkit_new_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) @@ -505,7 +335,6 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; - enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; @@ -514,8 +343,7 @@ static int netkit_new_link(struct net_device *dev, struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer = NULL; - bool seen_peer = false; + struct net_device *peer; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -552,12 +380,6 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); - if (data[IFLA_NETKIT_PAIRING]) - pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); - - seen_peer = data[IFLA_NETKIT_PEER_INFO] || - data[IFLA_NETKIT_PEER_SCRUB] || - data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -570,46 +392,45 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_SINGLE && - (tb != tbp || seen_peer || policy_prim != NETKIT_PASS)) - return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_PAIR) { - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); - - netif_inherit_tso_max(peer, dev); - if (headroom) - peer->needed_headroom = headroom; - if (tailroom) - peer->needed_tailroom = tailroom; - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->pair = pair; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); - - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; + netif_inherit_tso_max(peer, dev); + if (headroom) { + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; } + if (tailroom) { + peer->needed_tailroom = tailroom; + dev->needed_tailroom = tailroom; + } + + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -617,17 +438,12 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); - if (headroom) - dev->needed_headroom = headroom; - if (tailroom) - dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; - nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); @@ -639,12 +455,10 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - if (peer) - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - if (peer) - unregister_netdevice(peer); + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -704,8 +518,6 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); - if (nk->pair == NETKIT_DEVICE_SINGLE) - return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -1032,7 +844,6 @@ static void netkit_release_all(struct net_device *dev) static void netkit_uninit(struct net_device *dev) { netkit_release_all(dev); - netkit_queue_unlease(dev); } static void netkit_del_link(struct net_device *dev, struct list_head *head) @@ -1068,7 +879,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, - { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -1088,11 +898,9 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { - err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - if (nk->pair == NETKIT_DEVICE_PAIR) - err = netkit_check_policy(policy, attr, extack); + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -1113,48 +921,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return 0; } -static void netkit_check_lease_unregister(struct net_device *dev) -{ - LIST_HEAD(list_kill); - u32 q_idx; - - if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || - !dev->dev.parent) - return; - - netdev_lock_ops(dev); - for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { - struct net_device *tmp = dev; - u32 tmp_q_idx = q_idx; - - if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) { - if (tmp->netdev_ops != &netkit_netdev_ops) - continue; - /* A single phys device can have multiple queues leased - * to one netkit device. We can only queue that netkit - * device once to the list_kill. Queues of that phys - * device can be leased with different individual netkit - * devices, hence we batch via list_kill. - */ - if (unregister_netdevice_queued(tmp)) - continue; - netkit_del_link(tmp, &list_kill); - } - } - netdev_unlock_ops(dev); - unregister_netdevice_many(&list_kill); -} - -static int netkit_notifier(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (event == NETDEV_UNREGISTER) - netkit_check_lease_unregister(dev); - return NOTIFY_DONE; -} - static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ @@ -1165,7 +931,6 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ - nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -1186,8 +951,6 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; - if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) - return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -1209,15 +972,13 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), - [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; static struct rtnl_link_ops netkit_link_ops = { - .kind = NETKIT_DRV_NAME, + .kind = DRV_NAME, .priv_size = sizeof(struct netkit), - .alloc = netkit_alloc, .setup = netkit_setup, .newlink = netkit_new_link, .dellink = netkit_del_link, @@ -1231,39 +992,26 @@ static struct rtnl_link_ops netkit_link_ops = { .maxtype = IFLA_NETKIT_MAX, }; -static struct notifier_block netkit_netdev_notifier = { - .notifier_call = netkit_notifier, -}; - -static __init int netkit_mod_init(void) +static __init int netkit_init(void) { - int ret; - BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || (int)NETKIT_PASS != (int)TCX_PASS || (int)NETKIT_DROP != (int)TCX_DROP || (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); - ret = rtnl_link_register(&netkit_link_ops); - if (ret) - return ret; - ret = register_netdevice_notifier(&netkit_netdev_notifier); - if (ret) - rtnl_link_unregister(&netkit_link_ops); - return ret; + return rtnl_link_register(&netkit_link_ops); } -static __exit void netkit_mod_exit(void) +static __exit void netkit_exit(void) { - unregister_netdevice_notifier(&netkit_netdev_notifier); rtnl_link_unregister(&netkit_link_ops); } -module_init(netkit_mod_init); -module_exit(netkit_mod_exit); +module_init(netkit_init); +module_exit(netkit_exit); MODULE_DESCRIPTION("BPF-programmable network device"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_AUTHOR("Nikolay Aleksandrov "); MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d146c000e21..d99b0fbc1942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3400,17 +3400,11 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); - static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } -static inline bool unregister_netdevice_queued(const struct net_device *dev) -{ - return !list_empty(&dev->unreg_list); -} - int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 81dc7cb2360c..b55d3b9cb9c2 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -130,11 +130,6 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * - * @ndo_queue_create: Create a new RX queue which can be leased to another queue. - * Ops on this queue are redirected to the leased queue e.g. - * when opening a memory provider. Return the new queue id on - * success. Return negative error code on failure. - * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -154,12 +149,9 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); - int (*ndo_queue_create)(struct net_device *dev); }; -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -348,10 +340,5 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) }) struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack); -#endif /* _LINUX_NET_QUEUES_H */ + +#endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 508d11afaecb..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -28,8 +28,6 @@ struct netdev_rx_queue { #endif struct napi_struct *napi; struct pp_memory_provider_params mp_params; - struct netdev_rx_queue *lease; - netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -59,22 +57,5 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) } int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); -enum netif_lease_dir { - NETIF_VIRT_TO_PHYS, - NETIF_PHYS_TO_VIRT, -}; - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, - enum netif_lease_dir dir); -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev); -#endif /* _LINUX_NETDEV_RX_QUEUE_H */ +#endif diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b6f811c3416b..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c07cfb431eac..242e34f771cc 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bbd565757298..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,11 +1296,6 @@ enum netkit_mode { NETKIT_L3, }; -enum netkit_pairing { - NETKIT_DEVICE_PAIR, - NETKIT_DEVICE_SINGLE, -}; - /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1325,7 +1320,6 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, - IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 13a3de63a825..2661b68f5be3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,6 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) return __netdev_put_lock_ops_compat(dev, net); } -struct net_device * -netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker) -{ - netdev_tracker_free(dev, tracker); - return __netdev_put_lock(dev, dev_net(dev)); -} - struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) diff --git a/net/core/dev.h b/net/core/dev.h index 9bcb76b325d0..da18536cbd35 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,8 +30,6 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); -struct net_device *netdev_put_lock(struct net_device *dev, - netdevice_tracker *tracker); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 52ba99c019e7..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,12 +28,6 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ -const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { - [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), - [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, -}; - const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -113,13 +107,6 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; -/* NETDEV_CMD_QUEUE_CREATE - do */ -static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { - [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), - [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), -}; - /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -218,13 +205,6 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, - { - .cmd = NETDEV_CMD_QUEUE_CREATE, - .doit = netdev_nl_queue_create_doit, - .policy = netdev_queue_create_nl_policy, - .maxattr = NETDEV_A_QUEUE_LEASE, - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, - }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index d71b435d72c1..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,7 +14,6 @@ #include /* Common nested types */ -extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -37,7 +36,6 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 51c830f88f10..470fabbeacd9 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -391,11 +391,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; - struct net_device *orig_netdev = netdev; - struct nlattr *nest_lease, *nest_queue; struct netdev_rx_queue *rxq; struct netdev_queue *txq; - u32 lease_q_idx = q_idx; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -413,37 +410,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; - if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { - struct net *net, *peer_net; - - nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); - if (!nest_lease) - goto nla_put_failure; - nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); - if (!nest_queue) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) - goto nla_put_failure; - nla_nest_end(rsp, nest_queue); - if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, - READ_ONCE(netdev->ifindex))) - goto nla_put_failure; - rcu_read_lock(); - peer_net = dev_net_rcu(netdev); - net = dev_net_rcu(orig_netdev); - if (!net_eq(net, peer_net)) { - s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); - - if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) - goto nla_put_failure_unlock; - } - rcu_read_unlock(); - nla_nest_end(rsp, nest_lease); - netdev = orig_netdev; - } - params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) @@ -471,8 +437,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; -nla_put_failure_unlock: - rcu_read_unlock(); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; @@ -1156,155 +1120,6 @@ err_genlmsg_free: return err; } -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) -{ - const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; - const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; - int err, ifindex, ifindex_lease, queue_id, queue_id_lease; - struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; - struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev, *dev_lease; - netdevice_tracker dev_tracker; - struct nlattr *nest; - struct sk_buff *rsp; - void *hdr; - - if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) - return -EINVAL; - if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != - NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - - ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - - nest = info->attrs[NETDEV_A_QUEUE_LEASE]; - err = nla_parse_nested(ltb, lmaxtype, nest, - netdev_lease_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || - NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) - return -EINVAL; - if (ltb[NETDEV_A_LEASE_NETNS_ID]) { - NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]); - return -EINVAL; - } - - ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); - - nest = ltb[NETDEV_A_LEASE_QUEUE]; - err = nla_parse_nested(qtb, qmaxtype, nest, - netdev_queue_id_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || - NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) - return -EINVAL; - if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - if (ifindex == ifindex_lease) { - NL_SET_ERR_MSG(info->extack, - "Lease ifindex cannot be the same as queue creation ifindex"); - return -EINVAL; - } - - queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); - - rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!rsp) - return -ENOMEM; - - hdr = genlmsg_iput(rsp, info); - if (!hdr) { - err = -EMSGSIZE; - goto err_genlmsg_free; - } - - /* Locking order is always from the virtual to the physical device - * since this is also the same order when applications open the - * memory provider later on. - */ - dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); - if (!dev) { - err = -ENODEV; - goto err_genlmsg_free; - } - if (!netdev_can_create_queue(dev, info->extack)) { - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease, - &dev_tracker, GFP_KERNEL); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (!netdev_can_lease_queue(dev_lease, info->extack)) { - netdev_put(dev_lease, &dev_tracker); - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_put_lock(dev_lease, &dev_tracker); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (queue_id_lease >= dev_lease->real_num_rx_queues) { - err = -ERANGE; - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); - goto err_unlock_dev_lease; - } - if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) { - err = -EBUSY; - goto err_unlock_dev_lease; - } - - rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); - rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); - - if (rxq->lease && rxq->lease->dev != dev_lease) { - err = -EOPNOTSUPP; - NL_SET_ERR_MSG(info->extack, - "Leasing multiple queues from different devices not supported"); - goto err_unlock_dev_lease; - } - - err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev); - if (err < 0) { - NL_SET_ERR_MSG(info->extack, - "Device is unable to create a new queue"); - goto err_unlock_dev_lease; - } - - rxq = __netif_get_rx_queue(dev, queue_id); - netdev_rx_queue_lease(rxq, rxq_lease); - - nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); - genlmsg_end(rsp, hdr); - - netdev_unlock(dev_lease); - netdev_unlock(dev); - - return genlmsg_reply(rsp, info); - -err_unlock_dev_lease: - netdev_unlock(dev_lease); -err_unlock_dev: - netdev_unlock(dev); -err_genlmsg_free: - nlmsg_free(rsp); - return err; -} - void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c index 97acf6440829..251f27a8307f 100644 --- a/net/core/netdev_queues.c +++ b/net/core/netdev_queues.c @@ -1,37 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include /** * netdev_queue_get_dma_dev() - get dma device for zero-copy operations * @dev: net_device * @idx: queue index * - * Get dma device for zero-copy operations to be used for this queue. If the - * queue is leased to a physical queue, we retrieve the latter's dma device. + * Get dma device for zero-copy operations to be used for this queue. * When such device is not available or valid, the function will return NULL. * * Return: Device or NULL on error */ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) { - const struct netdev_queue_mgmt_ops *queue_ops; + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; struct device *dma_dev; - if (idx < dev->real_num_rx_queues) { - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); - - if (rxq->lease) { - rxq = rxq->lease; - dev = rxq->dev; - idx = get_netdev_rx_queue_index(rxq); - } - } - - queue_ops = dev->queue_mgmt_ops; - if (queue_ops && queue_ops->ndo_queue_get_dma_dev) dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); else @@ -40,58 +25,3 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; } -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Device is not a virtual device"); - return false; - } - if (!dev->queue_mgmt_ops || - !dev->queue_mgmt_ops->ndo_queue_create) { - NL_SET_ERR_MSG(extack, "Device does not support queue creation"); - return false; - } - if (dev->real_num_rx_queues < 1 || - dev->real_num_tx_queues < 1) { - NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); - return false; - } - return true; -} - -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); - return false; - } - if (!netif_device_present(dev)) { - NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); - return false; - } - if (!dev->queue_mgmt_ops) { - NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); - return false; - } - return true; -} - -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack) -{ - if (netif_rxq_is_leased(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue is already leased"); - return true; - } - if (xsk_get_pool_from_qid(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP"); - return true; - } - if (netif_rxq_has_mp(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider"); - return true; - } - return false; -} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 75c7a68cb90d..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,120 +9,14 @@ #include "page_pool_priv.h" -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_src->dev); - netdev_assert_locked(rxq_dst->dev); - - netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); - - WRITE_ONCE(rxq_src->lease, rxq_dst); - WRITE_ONCE(rxq_dst->lease, rxq_src); -} - -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_dst->dev); - netdev_assert_locked(rxq_src->dev); - - WRITE_ONCE(rxq_src->lease, NULL); - WRITE_ONCE(rxq_dst->lease, NULL); - - netdev_put(rxq_src->dev, &rxq_src->lease_tracker); -} - -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); - return false; -} - -static bool netif_lease_dir_ok(const struct net_device *dev, - enum netif_lease_dir dir) -{ - if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) - return true; - if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) - return true; - return false; -} - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, - enum netif_lease_dir dir) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); - - if (rxq->lease) { - if (!netif_lease_dir_ok(orig_dev, dir)) - return NULL; - rxq = rxq->lease; - *rxq_idx = get_netdev_rx_queue_index(rxq); - *dev = rxq->dev; - } - return rxq; -} - -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* Locking order is always from the virtual to the physical device - * see netdev_nl_queue_create_doit(). - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); - if (rxq && orig_dev != *dev) - netdev_lock(*dev); - return rxq; -} - -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev) -{ - if (orig_dev != dev) - netdev_unlock(dev); -} - -bool netif_rx_queue_lease_get_owner(struct net_device **dev, - unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* The physical device needs to be locked. If there is indeed a lease, - * then the virtual device holds a reference on the physical device - * and the lease stays active until the virtual device is torn down. - * When queues get {un,}leased both devices are always locked. - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); - if (rxq && orig_dev != *dev) - return true; - return false; -} - /* See also page_pool_is_unreadable() */ -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) { - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; - return false; -} -EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; - return false; + return !!rxq->mp_params.mp_ops; } +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { @@ -206,63 +100,49 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int ret; if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; + if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (!rxq) { - NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); - return -EBUSY; - } - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); - ret = -EBUSY; - goto out; - } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev_xdp_prog_count(dev)) { NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); - ret = -EEXIST; - goto out; + return -EEXIST; } + + rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - ret = -EEXIST; - goto out; + return -EEXIST; } #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) { NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - ret = -EBUSY; - goto out; + return -EBUSY; } #endif + rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; } -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } @@ -277,43 +157,38 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) return; - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (WARN_ON_ONCE(!rxq)) - return; + rxq = __netif_get_rx_queue(dev, ifq_idx); /* Callers holding a netdev ref may get here after we already * went thru shutdown via dev_memory_provider_uninstall(). */ if (dev->reg_state > NETREG_REGISTERED && !rxq->mp_params.mp_ops) - goto out; + return; if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || rxq->mp_params.mp_priv != old_p->mp_priv)) - goto out; + return; rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, rxq_idx); + err = netdev_rx_queue_restart(dev, ifq_idx); WARN_ON(err && err != -ENETDOWN); -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); } -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p) { netdev_lock(dev); - __net_mp_close_rxq(dev, rxq_idx, old_p); + __net_mp_close_rxq(dev, ifq_idx, old_p); netdev_unlock(dev); } diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 797d2a08c515..ca4f80282448 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" @@ -169,16 +169,14 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; - /* ensure channels are not busy at the moment */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); - for (i = from_channel; i < old_total; i++) { - if (netdev_queue_busy(dev, i, NULL)) { - GENL_SET_ERR_MSG(info, - "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)"); + for (i = from_channel; i < old_total; i++) + if (xsk_get_pool_from_qid(dev, i)) { + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } - } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 02a3454234d6..9431e305b233 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -27,13 +27,12 @@ #include #include #include -#include #include #include +#include #include #include -#include - +#include #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ @@ -2283,12 +2282,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (ret) return ret; - /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) - if (netdev_queue_busy(dev, i, NULL)) + if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92f791433725..3b46bc635c43 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,8 +23,6 @@ #include #include #include - -#include #include #include #include @@ -105,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xsk_uses_need_wakeup); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) @@ -119,18 +117,10 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - - if (id < dev->real_num_rx_queues) - WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); - - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = NULL; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = NULL; - - netif_put_rx_queue_lease_locked(orig_dev, dev); + if (queue_id < dev->num_rx_queues) + dev->_rx[queue_id].pool = NULL; + if (queue_id < dev->num_tx_queues) + dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do @@ -140,29 +130,17 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - int ret = 0; - - if (id >= max(dev->real_num_rx_queues, - dev->real_num_tx_queues)) + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) return -EINVAL; - if (id < dev->real_num_rx_queues) { - if (!netif_get_rx_queue_lease_locked(&dev, &id)) - return -EBUSY; - if (xsk_get_pool_from_qid(dev, id)) { - ret = -EBUSY; - goto out; - } - } - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = pool; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = pool; -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); - return ret; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, @@ -346,37 +324,14 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static bool xsk_dev_queue_valid(const struct xdp_sock *xs, - const struct xdp_rxq_info *info) -{ - struct net_device *dev = xs->dev; - u32 queue_index = xs->queue_id; - struct netdev_rx_queue *rxq; - - if (info->dev == dev && - info->queue_index == queue_index) - return true; - - if (queue_index < dev->real_num_rx_queues) { - rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); - if (!rxq) - return false; - - dev = rxq->dev; - queue_index = get_netdev_rx_queue_index(rxq); - - return info->dev == dev && - info->queue_index == queue_index; - } - return false; -} - static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; - if (!xsk_dev_queue_valid(xs, xdp->rxq)) + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index b94e81c2e030..eb838ae94844 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -62,13 +62,6 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 Local and remote endpoint IP addresses. -LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Local IP prefix/subnet which can be used to allocate extra IP addresses (for -network name spaces behind macvlan, veth, netkit devices). DUT must be -reachable using these addresses from the endpoint. - REMOTE_TYPE ~~~~~~~~~~~ diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 39ad86d693b3..9c163ba6feee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -32,8 +32,6 @@ TEST_PROGS = \ irq.py \ loopback.sh \ nic_timestamp.py \ - nk_netns.py \ - nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index 022008249313..d5d247eca6b7 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment (hardware-only tests). NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -30,7 +29,7 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", @@ -45,8 +44,8 @@ try: "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c deleted file mode 100644 index 86ebfc1445b6..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include - -#define TC_ACT_OK 0 -#define ETH_P_IPV6 0x86DD - -#define ctx_ptr(field) ((void *)(long)(field)) - -#define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ - a.s6_addr32[1] == b.s6_addr32[1]) - -volatile __u32 netkit_ifindex; -volatile __u8 ipv6_prefix[16]; - -SEC("tc/ingress") -int tc_redirect_peer(struct __sk_buff *skb) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - struct in6_addr *peer_addr; - struct ipv6hdr *ip6h; - struct ethhdr *eth; - - peer_addr = (struct in6_addr *)ipv6_prefix; - - if (skb->protocol != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - eth = data; - if ((void *)(eth + 1) > data_end) - return TC_ACT_OK; - - ip6h = data + sizeof(struct ethhdr); - if ((void *)(ip6h + 1) > data_end) - return TC_ACT_OK; - - if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) - return TC_ACT_OK; - - return bpf_redirect_peer(netkit_ifindex, 0); -} - -char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py deleted file mode 100755 index afa8638195d8..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_netns.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import cmd - - -def test_ping(cfg) -> None: - cfg.require_ipver("6") - - cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) - cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) - - -def main() -> None: - with NetDrvContEnv(__file__) as cfg: - ksft_run([test_ping], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py deleted file mode 100755 index 738a46d2d20c..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -import re -from os import path -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen - - -def create_rss_ctx(cfg): - output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout - values = re.search(r'New RSS context is (\d+)', output).group(1) - return int(values) - - -def set_flow_rule(cfg): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def set_flow_rule_rss(cfg, rss_ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def test_iou_zcrx(cfg) -> None: - cfg.require_ipver('6') - - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") - defer(ethtool, f"-X {cfg.ifname} default") - - flow_rule_id = set_flow_rule(cfg) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) - cmd(tx_cmd, host=cfg.remote) - - -def main() -> None: - with NetDrvContEnv(__file__, lease=True) as cfg: - cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") - cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - cfg.port = rand_port() - ksft_run([test_iou_zcrx], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index be3a8a936882..8b75faa9af6d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment. NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -44,12 +43,12 @@ try: "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] - from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from .env import NetDrvEnv, NetDrvEpEnv from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 7066d78395c6..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 -import ipaddress import os -import re import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev -from lib.py import NetdevFamily, EthtoolFamily from .remote import Remote -from . import bpftool class NetDrvEnvBase: @@ -293,156 +289,3 @@ class NetDrvEpEnv(NetDrvEnvBase): data.get('stats-block-usecs', 0) / 1000 / 1000 time.sleep(self._stats_settle_time) - - -class NetDrvContEnv(NetDrvEpEnv): - """ - Class for an environment with a netkit pair setup for forwarding traffic - between the physical interface and a network namespace. - """ - - def __init__(self, src_path, lease=False, **kwargs): - super().__init__(src_path, **kwargs) - - self.require_ipver("6") - local_prefix = self.env.get("LOCAL_PREFIX_V6") - if not local_prefix: - raise KsftSkipEx("LOCAL_PREFIX_V6 required") - - self.netdevnl = NetdevFamily() - self.ethnl = EthtoolFamily() - - local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") - self.ipv6_prefix = f"{local_prefix}::" - self.nk_host_ipv6 = f"{local_prefix}::2:1" - self.nk_guest_ipv6 = f"{local_prefix}::2:2" - - self.netns = None - self._nk_host_ifname = None - self._nk_guest_ifname = None - self._tc_attached = False - self._bpf_prog_pref = None - self._bpf_prog_id = None - self._leased = False - - nk_rxqueues = 1 - if lease: - nk_rxqueues = 2 - ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") - - all_links = ip("-d link show", json=True) - netkit_links = [link for link in all_links - if link.get('linkinfo', {}).get('info_kind') == 'netkit' - and 'UP' not in link.get('flags', [])] - - if len(netkit_links) != 2: - raise KsftSkipEx("Failed to create netkit pair") - - netkit_links.sort(key=lambda x: x['ifindex']) - self._nk_host_ifname = netkit_links[1]['ifname'] - self._nk_guest_ifname = netkit_links[0]['ifname'] - self.nk_host_ifindex = netkit_links[1]['ifindex'] - self.nk_guest_ifindex = netkit_links[0]['ifindex'] - - if lease: - self._lease_queues() - - self._setup_ns() - self._attach_bpf() - - def __del__(self): - if self._tc_attached: - cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") - self._tc_attached = False - - if self._nk_host_ifname: - cmd(f"ip link del dev {self._nk_host_ifname}") - self._nk_host_ifname = None - self._nk_guest_ifname = None - - if self.netns: - del self.netns - self.netns = None - - if self._leased: - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'unknown', - 'hds-thresh': self._hds_thresh, - 'rx': self._rx_rings}) - self._leased = False - - super().__del__() - - def _lease_queues(self): - channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}}) - channels = channels['combined-count'] - if channels < 2: - raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - - rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}}) - self._rx_rings = rings['rx'] - self._hds_thresh = rings.get('hds-thresh', 0) - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'enabled', - 'hds-thresh': 0, - 'rx': 64}) - self.src_queue = channels - 1 - bind_result = self.netdevnl.queue_create( - { - "ifindex": self.nk_guest_ifindex, - "type": "rx", - "lease": { - "ifindex": self.ifindex, - "queue": {"id": self.src_queue, "type": "rx"}, - }, - } - ) - self.nk_queue = bind_result['id'] - self._leased = True - - def _setup_ns(self): - self.netns = NetNS() - ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") - ip(f"link set dev {self._nk_host_ifname} up") - ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") - ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") - - ip("link set lo up", ns=self.netns) - ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) - ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) - ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) - ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) - - def _attach_bpf(self): - bpf_obj = self.test_dir / "nk_forward.bpf.o" - if not bpf_obj.exists(): - raise KsftSkipEx("BPF prog not found") - - cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action") - self._tc_attached = True - - tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout - match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info) - if not match: - raise Exception("Failed to get BPF prog ID") - self._bpf_prog_pref = int(match.group(1)) - self._bpf_prog_id = int(match.group(2)) - - prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) - map_ids = prog_info.get("map_ids", []) - - bss_map_id = None - for map_id in map_ids: - map_info = bpftool(f"map show id {map_id}", json=True) - if map_info.get("name").endswith("bss"): - bss_map_id = map_id - - if bss_map_id is None: - raise Exception("Failed to find .bss map") - - ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) - ipv6_bytes = ipv6_addr.packed - ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little') - value = ipv6_bytes + ifindex_bytes - value_hex = ' '.join(f'{b:02x}' for b in value) - bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") -- cgit v1.2.3 From 49743f27268ffbe2029d9c8fdfbd04d0869bd51d Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 16 Jan 2026 06:21:21 +0000 Subject: selftests: drv-net: extend HW timestamp test with ioctl Extend HW timestamp tests to check that ioctl interface is not broken and configuration setups and requests are equal to netlink interface. Some linter warnings are disabled because of ctypes classes. Reviewed-by: Kory Maincent Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20260116062121.1230184-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/nic_timestamp.py | 128 +++++++++++++++++++-- 1 file changed, 120 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py index c1e943d53f19..c632b41e7a23 100755 --- a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py +++ b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py @@ -1,15 +1,38 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +# pylint: disable=locally-disabled, invalid-name, attribute-defined-outside-init, too-few-public-methods """ Tests related to configuration of HW timestamping """ import errno +import ctypes +import fcntl +import socket from lib.py import ksft_run, ksft_exit, ksft_ge, ksft_eq, KsftSkipEx from lib.py import NetDrvEnv, EthtoolFamily, NlError +SIOCSHWTSTAMP = 0x89b0 +SIOCGHWTSTAMP = 0x89b1 +class hwtstamp_config(ctypes.Structure): + """ Python copy of struct hwtstamp_config """ + _fields_ = [ + ("flags", ctypes.c_int), + ("tx_type", ctypes.c_int), + ("rx_filter", ctypes.c_int), + ] + + +class ifreq(ctypes.Structure): + """ Python copy of struct ifreq """ + _fields_ = [ + ("ifr_name", ctypes.c_char * 16), + ("ifr_data", ctypes.POINTER(hwtstamp_config)), + ] + + def __get_hwtimestamp_support(cfg): """ Retrieve supported configuration information """ @@ -31,8 +54,29 @@ def __get_hwtimestamp_support(cfg): return ctx +def __get_hwtimestamp_config_ioctl(cfg): + """ Retrieve current TS configuration information (via ioctl) """ + + config = hwtstamp_config() + + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCGHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + return config + + def __get_hwtimestamp_config(cfg): - """ Retrieve current TS configuration information """ + """ Retrieve current TS configuration information (via netLink) """ try: tscfg = cfg.ethnl.tsconfig_get({'header': {'dev-name': cfg.ifname}}) @@ -43,8 +87,27 @@ def __get_hwtimestamp_config(cfg): return tscfg +def __set_hwtimestamp_config_ioctl(cfg, ts): + """ Setup new TS configuration information (via ioctl) """ + config = hwtstamp_config() + config.rx_filter = ts['rx-filters']['bits']['bit'][0]['index'] + config.tx_type = ts['tx-types']['bits']['bit'][0]['index'] + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCSHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + + def __set_hwtimestamp_config(cfg, ts): - """ Setup new TS configuration information """ + """ Setup new TS configuration information (via netlink) """ ts['header'] = {'dev-name': cfg.ifname} try: @@ -56,9 +119,9 @@ def __set_hwtimestamp_config(cfg, ts): return res -def test_hwtstamp_tx(cfg): +def __perform_hwtstamp_tx(cfg, is_ioctl): """ - Test TX timestamp configuration. + Test TX timestamp configuration via either netlink or ioctl. The driver should apply provided config and report back proper state. """ @@ -66,16 +129,37 @@ def test_hwtstamp_tx(cfg): ts = __get_hwtimestamp_support(cfg) tx = ts['tx'] for t in tx: + res = None tscfg = orig_tscfg tscfg['tx-types']['bits']['bit'] = [t] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) ksft_eq(res['tx-types']['bits']['bit'], [t]) + ksft_eq(resioctl.tx_type, t['index']) __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_tx_netlink(cfg): + """ + Test TX timestamp configuration setup via netlink. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, False) + + +def test_hwtstamp_tx_ioctl(cfg): + """ + Test TX timestamp configuration setup via ioctl. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, True) + -def test_hwtstamp_rx(cfg): +def __perform_hwtstamp_rx(cfg, is_ioctl): """ Test RX timestamp configuration. The filter configuration is taken from the list of supported filters. @@ -87,11 +171,17 @@ def test_hwtstamp_rx(cfg): ts = __get_hwtimestamp_support(cfg) rx = ts['rx'] for r in rx: + res = None tscfg = orig_tscfg tscfg['rx-filters']['bits']['bit'] = [r] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) + ksft_eq(resioctl.rx_filter, res['rx-filters']['bits']['bit'][0]['index']) if r['index'] == 0 or r['index'] == 1: ksft_eq(res['rx-filters']['bits']['bit'][0]['index'], r['index']) else: @@ -100,12 +190,34 @@ def test_hwtstamp_rx(cfg): __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_rx_netlink(cfg): + """ + Test RX timestamp configuration via netlink. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, False) + + +def test_hwtstamp_rx_ioctl(cfg): + """ + Test RX timestamp configuration via ioctl. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, True) + + def main() -> None: """ Ksft boiler plate main """ with NetDrvEnv(__file__, nsim_test=False) as cfg: cfg.ethnl = EthtoolFamily() - ksft_run([test_hwtstamp_tx, test_hwtstamp_rx], args=(cfg,)) + ksft_run([test_hwtstamp_tx_ioctl, test_hwtstamp_tx_netlink, + test_hwtstamp_rx_ioctl, test_hwtstamp_rx_netlink], + args=(cfg,)) ksft_exit() -- cgit v1.2.3 From 1802e9079f65cbe47d90d048b06df650a91407f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Jan 2026 10:03:19 -0800 Subject: selftests: drv-net: fix missing include in ncdevmem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ca9d74eb5f6a ("uapi: add INT_MAX and INT_MIN constants") recently removed some includes of limits.h in uAPI headers. ncdevmem.c was depending on them: ncdevmem.c: In function ‘ethtool_add_flow’: ncdevmem.c:369:60: error: ‘INT_MAX’ undeclared (first use in this function) 369 | if (endptr == id_start || flow_id < 0 || flow_id > INT_MAX) | ^~~~~~~ ncdevmem.c:77:1: note: ‘INT_MAX’ is defined in header ‘’; did you forget to ‘#include ’? Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20260120180319.1673271-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 3288ed04ce08..16864c844108 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -48,6 +48,7 @@ #include #define __iovec_defined #include +#include #include #include #include -- cgit v1.2.3 From 9f5edd785da3cf373285259928d7f1f08c9ce758 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 9 Dec 2025 08:47:45 +0530 Subject: tools/mm/thp_swap_allocator_test: fix small folio alignment Use ALIGNMENT_SMALLFOLIO instead of ALIGNMENT_MTHP when allocating small folios to ensure correct memory alignment for the test case. Before: test allocates small folios with 64KB alignment (ALIGNMENT_MTHP) when only 4KB alignment (ALIGNMENT_SMALLFOLIO) is needed. This wastes address space and may cause allocation failures on systems with fragmented memory. Worst-case impact: this only affects thp_swap_allocator_test tool behavior. Link: https://lkml.kernel.org/r/20251209031745.2723120-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- tools/mm/thp_swap_allocator_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c index 83afc52275a5..d4434df3dcff 100644 --- a/tools/mm/thp_swap_allocator_test.c +++ b/tools/mm/thp_swap_allocator_test.c @@ -142,7 +142,7 @@ int main(int argc, char *argv[]) } if (use_small_folio) { - mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP); + mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_SMALLFOLIO); if (mem2 == NULL) { fprintf(stderr, "Failed to allocate small folios memory\n"); free(mem1); -- cgit v1.2.3 From 8b8017d7c411403731ee4d502cdbd76e9425f0e1 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Mon, 8 Dec 2025 16:22:40 +0530 Subject: tools/mm/slabinfo: fix --partial long option mapping The long option "--partial" was incorrectly mapped to lowercase 'p' in the opts[] array, but the getopt string and switch case handle uppercase 'P'. This mismatch caused --partial to be rejected. Fix the long_options mapping to use 'P' so --partial works correctly alongside the existing -P short option. Link: https://lkml.kernel.org/r/20251208105240.2719773-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Reviewed-by: SeongJae Park Tested-by: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/mm/slabinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index 80cdbd3db82d..54c7265ab52d 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -1405,7 +1405,7 @@ struct option opts[] = { { "numa", no_argument, NULL, 'n' }, { "lines", required_argument, NULL, 'N'}, { "ops", no_argument, NULL, 'o' }, - { "partial", no_argument, NULL, 'p'}, + { "partial", no_argument, NULL, 'P'}, { "report", no_argument, NULL, 'r' }, { "shrink", no_argument, NULL, 's' }, { "Size", no_argument, NULL, 'S'}, -- cgit v1.2.3 From a98ec863fdedf4940447f32ceda7d937bebd06a2 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Mon, 1 Dec 2025 13:18:48 -0500 Subject: lib/test_vmalloc.c: minor fixes to test_vmalloc.c If PAGE_SIZE is larger than 4k and if you have a system with a large number of CPUs, this test can require a very large amount of memory leading to oom-killer firing. Given the type of allocation, the kernel won't have anything to kill, causing the system to stall. Add a parameter to the test_vmalloc driver to represent the number of times a percpu object will be allocated. Calculate this in test_vmalloc.sh to be 90% of available memory or the current default of 35000, whichever is smaller. Link: https://lkml.kernel.org/r/20251201181848.1216197-1-audra@redhat.com Signed-off-by: Audra Mitchell Reviewed-by: Andrew Morton Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Rafael Aquini Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 11 +++++++---- tools/testing/selftests/mm/test_vmalloc.sh | 31 +++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 6521c05c7816..270b6f7ca807 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -58,6 +58,9 @@ __param(int, run_test_mask, 7, /* Add a new test case description here. */ ); +__param(int, nr_pcpu_objects, 35000, + "Number of pcpu objects to allocate for pcpu_alloc_test"); + /* * This is for synchronization of setup phase. */ @@ -317,24 +320,24 @@ pcpu_alloc_test(void) size_t size, align; int i; - pcpu = vmalloc(sizeof(void __percpu *) * 35000); + pcpu = vmalloc(sizeof(void __percpu *) * nr_pcpu_objects); if (!pcpu) return -1; - for (i = 0; i < 35000; i++) { + for (i = 0; i < nr_pcpu_objects; i++) { size = get_random_u32_inclusive(1, PAGE_SIZE / 4); /* * Maximum PAGE_SIZE */ - align = 1 << get_random_u32_inclusive(1, 11); + align = 1 << get_random_u32_inclusive(1, PAGE_SHIFT - 1); pcpu[i] = __alloc_percpu(size, align); if (!pcpu[i]) rv = -1; } - for (i = 0; i < 35000; i++) + for (i = 0; i < nr_pcpu_objects; i++) free_percpu(pcpu[i]); vfree(pcpu); diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index d39096723fca..b23d705bf570 100755 --- a/tools/testing/selftests/mm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -13,6 +13,9 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" NUM_CPUS=`grep -c ^processor /proc/cpuinfo` +# Default number of times we allocate percpu objects: +NR_PCPU_OBJECTS=35000 + # 1 if fails exitcode=1 @@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3" SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" +PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS" + check_test_requirements() { uid=$(id -u) @@ -47,12 +52,30 @@ check_test_requirements() fi } +check_memory_requirement() +{ + # The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the + # PAGE_SIZE is on the larger side it is easier to set a value + # that can cause oom events during testing. Since we are + # testing the functionality of vmalloc and not the oom-killer, + # calculate what is 90% of available memory and divide it by + # the number of online CPUs. + pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS)) + + if (($pages < $NR_PCPU_OBJECTS)); then + echo "Updated nr_pcpu_objects to 90% of available memory." + echo "nr_pcpu_objects is now set to: $pages." + PCPU_OBJ_PARAM="nr_pcpu_objects=$pages" + fi +} + run_performance_check() { echo "Run performance tests to evaluate how fast vmalloc allocation is." echo "It runs all test cases on one single CPU with sequential order." - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel message buffer to see the summary." } @@ -63,7 +86,8 @@ run_stability_check() echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } @@ -74,7 +98,8 @@ run_smoke_check() echo "Please check $0 output how it can be used" echo "for deep performance analysis as well as stress testing." - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } -- cgit v1.2.3 From 8e46adb62fae98a866baa7c23f6ed3bfe02e6f88 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:37 +0800 Subject: selftests/mm/write_to_hugetlbfs: parse -s as size_t Patch series "selftests/mm: hugetlb cgroup charging: robustness fixes", v3. This series fixes a few issues in the hugetlb cgroup charging selftests (write_to_hugetlbfs.c + charge_reserved_hugetlb.sh) that show up on systems with large hugepages (e.g. 512MB) and when failures cause the test to wait indefinitely. On an aarch64 64k page kernel with 512MB hugepages, the test consistently fails in write_to_hugetlbfs with ENOMEM and then hangs waiting for the expected usage values. The root cause is that charge_reserved_hugetlb.sh mounts hugetlbfs with a fixed size=256M, which is smaller than a single hugepage, resulting in a mount with size=0 capacity. In addition, write_to_hugetlbfs previously parsed -s via atoi() into an int, which can overflow and print negative sizes. Reproducer / environment: - Kernel: 6.12.0-xxx.el10.aarch64+64k - Hugepagesize: 524288 kB (512MB) - ./charge_reserved_hugetlb.sh -cgroup-v2 - Observed mount: pagesize=512M,size=0 before this series After applying the series, the test completes successfully on the above setup. This patch (of 3): write_to_hugetlbfs currently parses the -s size argument with atoi() into an int. This silently accepts malformed input, cannot report overflow, and can truncate large sizes. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # Writing to this path: /mnt/huge/test # Writing this size: -1610612736 <-------- Switch the size variable to size_t and parse -s with sscanf("%zu", ...). Also print the size using %zu. This avoids incorrect behavior with large -s values and makes the utility more robust. Link: https://lkml.kernel.org/r/20251221122639.3168038-1-liwang@redhat.com Link: https://lkml.kernel.org/r/20251221122639.3168038-2-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: David Hildenbrand Cc: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/write_to_hugetlbfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 34c91f7e6128..ecb5f7619960 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -68,7 +68,7 @@ int main(int argc, char **argv) int key = 0; int *ptr = NULL; int c = 0; - int size = 0; + size_t size = 0; char path[256] = ""; enum method method = MAX_METHOD; int want_sleep = 0, private = 0; @@ -86,7 +86,10 @@ int main(int argc, char **argv) while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { switch (c) { case 's': - size = atoi(optarg); + if (sscanf(optarg, "%zu", &size) != 1) { + perror("Invalid -s."); + exit_usage(); + } break; case 'p': strncpy(path, optarg, sizeof(path) - 1); @@ -131,7 +134,7 @@ int main(int argc, char **argv) } if (size != 0) { - printf("Writing this size: %d\n", size); + printf("Writing this size: %zu\n", size); } else { errno = EINVAL; perror("size not found"); -- cgit v1.2.3 From 1aa1dd9cc595917882fb6db67725442956f79607 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:38 +0800 Subject: selftests/mm/charge_reserved_hugetlb: drop mount size for hugetlbfs charge_reserved_hugetlb.sh mounts a hugetlbfs instance at /mnt/huge with a fixed size of 256M. On systems with large base hugepages (e.g. 512MB), this is smaller than a single hugepage, so the hugetlbfs mount ends up with zero capacity (often visible as size=0 in mount output). As a result, write_to_hugetlbfs fails with ENOMEM and the test can hang waiting for progress. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... # mount |grep /mnt/huge none on /mnt/huge type hugetlbfs (rw,relatime,seclabel,pagesize=512M,size=0) # grep -i huge /proc/meminfo ... HugePages_Total: 10 HugePages_Free: 10 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 524288 kB Hugetlb: 5242880 kB Drop the mount args with 'size=256M', so the filesystem capacity is sufficient regardless of HugeTLB page size. Link: https://lkml.kernel.org/r/20251221122639.3168038-3-liwang@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: Mark Brown Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index e1fe16bcbbe8..fa6713892d82 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -290,7 +290,7 @@ function run_test() { setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ @@ -344,7 +344,7 @@ function run_multiple_cgroup_test() { setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ -- cgit v1.2.3 From b618876f2e7055160cc5b98b4ff5cd8917e7b49e Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:39 +0800 Subject: selftests/mm/charge_reserved_hugetlb.sh: add waits with timeout helper The hugetlb cgroup usage wait loops in charge_reserved_hugetlb.sh were unbounded and could hang forever if the expected cgroup file value never appears (e.g. due to write_to_hugetlbfs in Error mapping). === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... Introduce a small helper, wait_for_file_value(), and use it for: - waiting for reservation usage to drop to 0, - waiting for reservation usage to reach a given size, - waiting for fault usage to reach a given size. This makes the waits consistent and adds a hard timeout (60 tries with 1s sleep) so the test fails instead of stalling indefinitely. Link: https://lkml.kernel.org/r/20251221122639.3168038-4-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Cc: Mark Brown Cc: Shuah Khan Cc: Waiman Long Signed-off-by: Andrew Morton --- .../selftests/mm/charge_reserved_hugetlb.sh | 51 +++++++++++++--------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index fa6713892d82..447769657634 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -100,7 +100,7 @@ function setup_cgroup() { echo writing cgroup limit: "$cgroup_limit" echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - echo writing reseravation limit: "$reservation_limit" + echo writing reservation limit: "$reservation_limit" echo "$reservation_limit" > \ $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file @@ -112,41 +112,50 @@ function setup_cgroup() { fi } +function wait_for_file_value() { + local path="$1" + local expect="$2" + local max_tries=60 + + if [[ ! -r "$path" ]]; then + echo "ERROR: cannot read '$path', missing or permission denied" + return 1 + fi + + for ((i=1; i<=max_tries; i++)); do + local cur="$(cat "$path")" + if [[ "$cur" == "$expect" ]]; then + return 0 + fi + echo "Waiting for $path to become '$expect' (current: '$cur') (try $i/$max_tries)" + sleep 1 + done + + echo "ERROR: timeout waiting for $path to become '$expect'" + return 1 +} + function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "0" } function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function write_hugetlbfs_and_get_usage() { -- cgit v1.2.3 From b47beff129c6193df3dd406f2db2628fcc09d1eb Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:21 +0800 Subject: selftests/mm: fix va_high_addr_switch.sh return value Patch series "Fix va_high_addr_switch.sh test failure - again", v2. The series address several issues exist for the va_high_addr_switch test: 1) the test return value is ignored in va_high_addr_switch.sh. 2) the va_high_addr_switch test requires 6 hugepages not 5. 3) the reurn value of the first test in va_high_addr_switch.c can be overridden by the second test. 4) the nr_hugepages setup in run_vmtests.sh for arm64 can be done in va_high_addr_switch.sh too. 5) update a comment for check_test_requirements. This patch: (of 5) The return value should be return value of va_high_addr_switch, otherwise a test failure would be silently ignored. Link: https://lkml.kernel.org/r/20251221040025.3159990-1-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a7d4b02b21dd..f89fe078a8e6 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -114,4 +114,6 @@ save_nr_hugepages # 4 keep_mapped pages, and one for tmp usage setup_nr_hugepages 5 ./va_high_addr_switch --run-hugetlb +retcode=$? restore_nr_hugepages +exit $retcode -- cgit v1.2.3 From b1f031e33cb5ae4be039a17613ad8da84c777e70 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:22 +0800 Subject: selftests/mm: allocate 6 hugepages in va_high_addr_switch.sh The va_high_addr_switch test requires 6 hugepages, not 5. If running the test directly by: ./va_high_addr_switch.sh, the test will hit a mmap 'FAIL' caused by not enough hugepages: mmap(addr_switch_hint - hugepagesize, 2*hugepagesize, MAP_HUGETLB): 0x7f330f800000 - OK mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB): 0xffffffffffffffff - FAILED The failure can't be hit if run the tests by running 'run_vmtests.sh -t hugevm' because the nr_hugepages is set to 128 at the beginning of run_vmtests.sh and va_high_addr_switch.sh skip the setup of nr_hugepages because already enough. Link: https://lkml.kernel.org/r/20251221040025.3159990-2-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index f89fe078a8e6..a0c93d348b11 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -111,8 +111,8 @@ setup_nr_hugepages() check_test_requirements save_nr_hugepages -# 4 keep_mapped pages, and one for tmp usage -setup_nr_hugepages 5 +# The HugeTLB tests require 6 pages +setup_nr_hugepages 6 ./va_high_addr_switch --run-hugetlb retcode=$? restore_nr_hugepages -- cgit v1.2.3 From 7544d7969d84c1c4a078d1c5a7d4117fbf6f385c Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:23 +0800 Subject: selftests/mm: remove arm64 nr_hugepages setup for va_high_addr_switch test arm64 and x86_64 has the same nr_hugepages requriement for running the va_high_addr_switch test. Since commit d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test"), the setup can be done in va_high_addr_switch.sh. So remove the duplicated setup. Link: https://lkml.kernel.org/r/20251221040025.3159990-3-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 -------- 1 file changed, 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d9173f2312b7..2dadbfc6e535 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -412,15 +412,7 @@ if [ $VADDR64 -ne 0 ]; then fi # va high address boundary switch test - ARCH_ARM64="arm64" - prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo 6 > /proc/sys/vm/nr_hugepages - fi CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages - fi fi # VADDR64 # vmalloc stability smoke test -- cgit v1.2.3 From dd0202a0bd81c33096f3d473c296cad997baba5b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:24 +0800 Subject: selftests/mm: va_high_addr_switch return fail when either test failed When the first test failed, and the hugetlb test passed, the result would be pass, but we expect a fail. Fix this issue by returning fail if either is not KSFT_PASS. Link: https://lkml.kernel.org/r/20251221040025.3159990-4-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 02f290a69132..51401e081b20 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -322,7 +322,7 @@ static int supported_arch(void) int main(int argc, char **argv) { - int ret; + int ret, hugetlb_ret = KSFT_PASS; if (!supported_arch()) return KSFT_SKIP; @@ -331,6 +331,10 @@ int main(int argc, char **argv) ret = run_test(testcases, sz_testcases); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); - return ret; + hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); + + if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS) + return KSFT_PASS; + else + return KSFT_FAIL; } -- cgit v1.2.3 From 6319c4f44234c3849fdb2c3f72c45353aa428d3f Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:25 +0800 Subject: selftests/mm: fix comment for check_test_requirements The test supports arm64 as well so the comment is incorrect. And there's a check for arm64 in va_high_addr_switch.c. Link: https://lkml.kernel.org/r/20251221040025.3159990-5-chuhu@redhat.com Fixes: 983e760bcdb6 ("selftest/mm: va_high_addr_switch: add ppc64 support check") Fixes: f556acc2facd ("selftests/mm: skip test for non-LPA2 and non-LVA systems") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a0c93d348b11..9492c2d72634 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -61,9 +61,9 @@ check_supported_ppc64() check_test_requirements() { - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. + # The test supports x86_64, powerpc64 and arm64. There's check for arm64 + # in va_high_addr_switch.c. The test itself will reject other architectures. + case `uname -m` in "x86_64") check_supported_x86_64 -- cgit v1.2.3 From e700f5d1560798aacf0e56fdcc70ee2c20bf56ec Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 16 Dec 2025 02:45:21 -0500 Subject: watchdog: softlockup: panic when lockup duration exceeds N thresholds The softlockup_panic sysctl is currently a binary option: panic immediately or never panic on soft lockups. Panicking on any soft lockup, regardless of duration, can be overly aggressive for brief stalls that may be caused by legitimate operations. Conversely, never panicking may allow severe system hangs to persist undetected. Extend softlockup_panic to accept an integer threshold, allowing the kernel to panic only when the normalized lockup duration exceeds N watchdog threshold periods. This provides finer-grained control to distinguish between transient delays and persistent system failures. The accepted values are: - 0: Don't panic (unchanged) - 1: Panic when duration >= 1 * threshold (20s default, original behavior) - N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.) The original behavior is preserved for values 0 and 1, maintaining full backward compatibility while allowing systems to tolerate brief lockups while still catching severe, persistent hangs. [lirongqing@baidu.com: v2] Link: https://lkml.kernel.org/r/20251218074300.4080-1-lirongqing@baidu.com Link: https://lkml.kernel.org/r/20251216074521.2796-1-lirongqing@baidu.com Signed-off-by: Li RongQing Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Lance Yang Cc: Martin KaFai Lau Cc: Nicholas Piggin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 10 +++++----- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/configs/pxa3xx_defconfig | 2 +- arch/openrisc/configs/or1klitex_defconfig | 2 +- arch/powerpc/configs/skiroot_defconfig | 2 +- drivers/gpu/drm/ci/arm.config | 2 +- drivers/gpu/drm/ci/arm64.config | 2 +- drivers/gpu/drm/ci/x86_64.config | 2 +- kernel/configs/debug.config | 2 +- kernel/watchdog.c | 10 ++++++---- lib/Kconfig.debug | 13 +++++++------ tools/testing/selftests/bpf/config | 2 +- tools/testing/selftests/wireguard/qemu/kernel.config | 2 +- 13 files changed, 28 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1058f2a6d6a8..73d846211144 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6969,12 +6969,12 @@ Kernel parameters softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: 0 | 1 + Format: - A value of 1 instructs the soft-lockup detector - to panic the machine when a soft-lockup occurs. It is - also controlled by the kernel.softlockup_panic sysctl - and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the + A value of non-zero instructs the soft-lockup detector + to panic the machine when a soft-lockup duration exceeds + N thresholds. It is also controlled by the kernel.softlockup_panic + sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the respective build-time switch to that functionality. softlockup_all_cpu_backtrace= diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 2e6ea13c1e9b..ec558e57d081 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig index 07d422f0ff34..fb272e3a2337 100644 --- a/arch/arm/configs/pxa3xx_defconfig +++ b/arch/arm/configs/pxa3xx_defconfig @@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SHIRQ=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index fb1eb9a68bd6..984b0e3b2768 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf" CONFIG_PRINTK_TIME=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BUG_ON_DATA_CORRUPTION=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 2b71a6dc399e..a4114fca5a39 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_WQ_WATCHDOG=y diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config index 411e814819a8..d7c51670da2f 100644 --- a/drivers/gpu/drm/ci/arm.config +++ b/drivers/gpu/drm/ci/arm.config @@ -52,7 +52,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=n -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_FW_LOADER_COMPRESS=y diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config index fddfbd4d2493..ea0e30737c4d 100644 --- a/drivers/gpu/drm/ci/arm64.config +++ b/drivers/gpu/drm/ci/arm64.config @@ -161,7 +161,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config index 8eaba388b141..7ac98a78691e 100644 --- a/drivers/gpu/drm/ci/x86_64.config +++ b/drivers/gpu/drm/ci/x86_64.config @@ -47,7 +47,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 9f6ab7dabf67..774702591d26 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y # Debug Oops, Lockups and Hangs # CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 366122f4a0f8..b4d5fbdb933a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); - int duration; int softlockup_all_cpu_backtrace; + int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) @@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); - if (softlockup_panic) + thresh_count = duration / get_softlockup_thresh(); + + if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } @@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4bfca37f313e..947e62e92da8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM the CPU stats and the interrupt counts during the "soft lockups". config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" + int "Panic (Reboot) On Soft Lockups" depends on SOFTLOCKUP_DETECTOR + default 0 help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. + Set to a non-zero value N to enable the kernel to panic on "soft + lockups", which are bugs that cause the kernel to loop in kernel + mode for more than (N * 20 seconds) (configurable using the + watchdog_thresh sysctl), without giving other tasks a chance to run. The panic can be used in combination with panic_timeout, to cause the system to reboot automatically after a @@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC high-availability systems that have uptime guarantees and where a lockup must be resolved ASAP. - Say N if unsure. + Say 0 if unsure. config HAVE_HARDLOCKUP_DETECTOR_BUDDY bool diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 558839e3c185..24855381290d 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -1,6 +1,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BPF=y CONFIG_BPF_EVENTS=y CONFIG_BPF_JIT=y diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0504c11c2de6..bb89d2dfaa2a 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_PANIC_TIMEOUT=-1 CONFIG_STACKTRACE=y -- cgit v1.2.3 From 4fca95095cdcd81bd4a8c8c7008fb3c175a3a5d5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 20 Jan 2026 15:05:55 +0800 Subject: selftests/bpf: test the jited inline of bpf_get_current_task Add the testcase for the jited inline of bpf_get_current_task(). Signed-off-by: Menglong Dong Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120070555.233486-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_jit_inline.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index b6a1e79709be..302286a80154 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -112,6 +112,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "verifier_jit_inline.skel.h" #include "irq.skel.h" #define MAX_ENTRIES 11 @@ -255,6 +256,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } +void test_verifier_jit_inline(void) { RUN(verifier_jit_inline); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c new file mode 100644 index 000000000000..4ea254063646 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("fentry/bpf_fentry_test1") +__success __retval(0) +__arch_x86_64 +__jited(" addq %gs:{{.*}}, %rax") +__arch_arm64 +__jited(" mrs x7, SP_EL0") +int inline_bpf_get_current_task(void) +{ + bpf_get_current_task(); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 9a0a5b5ac4372da84394dc329f763d6b7d384a86 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 20 Jan 2026 18:26:44 -0300 Subject: perf list: Signal changing const memory is ok In this case its a temp list that is created just for listing events and will be deleted at the end, so just cast it to get rid of the compiler warning. Reviewed-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 8f3ed83853a9..4bbcdbf05b84 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -86,7 +86,7 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) strlist__for_each_entry(sdt_name, sdtlist) { bool show_detail = false; - char *bid = strchr(sdt_name->s, '@'); + char *bid = (char *)strchr(sdt_name->s, '@'); char *evt_name = NULL; if (bid) -- cgit v1.2.3 From 29132d16965e66fed0bf7b38242e7e57df294ba0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 20 Jan 2026 18:16:09 -0300 Subject: perf list: Don't write to const memory Something now detected on fedora 44, where strchr() returns const if it is passed a const pointer: util/print-events.c: In function 'print_sdt_events': util/print-events.c:89:29: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 89 | char *bid = strchr(sdt_name->s, '@'); | ^~~~~~ Fix it by using strchrnul() + strncmp() instead of temporarily scrubbing it with '\0'. Reviewed-by: Ian Rogers Suggested-by: David Laight Link: https://lore.kernel.org/r/20260121112536.27fd5d11@pumpkin Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 4bbcdbf05b84..cb27e2898aa0 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -97,14 +97,9 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) } else { next_sdt_name = strlist__next(sdt_name); if (next_sdt_name) { - char *bid2 = strchr(next_sdt_name->s, '@'); - - if (bid2) - *bid2 = '\0'; - if (strcmp(sdt_name->s, next_sdt_name->s) == 0) - show_detail = true; - if (bid2) - *bid2 = '@'; + const char *bid2 = strchrnul(next_sdt_name->s, '@'); + + show_detail = strncmp(sdt_name->s, next_sdt_name->s, bid2 - next_sdt_name->s) == 0; } } last_sdt_name = sdt_name->s; -- cgit v1.2.3 From 75aad5ffe099a1b1a342257236dc260493917ed2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:00 +0800 Subject: selftests/ublk: fix IO thread idle check Include cmd_inflight in ublk_thread_is_done() check. Without this, the thread may exit before all FETCH commands are completed, which may cause device deletion to hang. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..f52431fe9b6c 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -753,7 +753,7 @@ static int ublk_thread_is_idle(struct ublk_thread *t) static int ublk_thread_is_done(struct ublk_thread *t) { - return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t); + return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight; } static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, -- cgit v1.2.3 From 23e62cf75518825aac12e9a22bdc40f062428898 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:01 +0800 Subject: selftests/ublk: fix error handling for starting device Fix error handling in ublk_start_daemon() when start_dev fails: 1. Call ublk_ctrl_stop_dev() to cancel inflight uring_cmd before cleanup. Without this, the device deletion may hang waiting for I/O completion that will never happen. 2. Add fail_start label so that pthread_join() is called on the error path. This ensures proper thread cleanup when startup fails. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f52431fe9b6c..65f59e7b6972 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1054,7 +1054,9 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) } if (ret < 0) { ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); - goto fail; + /* stop device so that inflight uring_cmd can be cancelled */ + ublk_ctrl_stop_dev(dev); + goto fail_start; } ublk_ctrl_get_info(dev); @@ -1062,7 +1064,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); - +fail_start: /* wait until we are terminated */ for (i = 0; i < dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); -- cgit v1.2.3 From e7e1cc18f120a415646be12470169a978a1adcd9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:02 +0800 Subject: selftests/ublk: fix garbage output in foreground mode Initialize _evtfd to -1 in struct dev_ctx to prevent garbage output when running kublk in foreground mode. Without this, _evtfd is zero-initialized to 0 (stdin), and ublk_send_dev_event() writes binary data to stdin which appears as garbage on the terminal. Also fix debug message format string. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 65f59e7b6972..f197ad9cc262 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1274,7 +1274,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) } ret = ublk_start_daemon(ctx, dev); - ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) ublk_ctrl_del_dev(dev); @@ -1620,6 +1620,7 @@ int main(int argc, char *argv[]) int option_idx, opt; const char *cmd = argv[1]; struct dev_ctx ctx = { + ._evtfd = -1, .queue_depth = 128, .nr_hw_queues = 2, .dev_id = -1, -- cgit v1.2.3 From 73061dbeca783aaf311e1af9610f8cba1c1176cd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 21:11:44 +0000 Subject: selftests/io_uring: add io_uring_queue_init_params Add a ring init variant taking struct io_uring_params, which mimicks liburing API. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- tools/include/io_uring/mini_liburing.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h index 9ccb16074eb5..a55407b09dbb 100644 --- a/tools/include/io_uring/mini_liburing.h +++ b/tools/include/io_uring/mini_liburing.h @@ -126,21 +126,18 @@ static inline int io_uring_enter(int fd, unsigned int to_submit, flags, sig, _NSIG / 8); } -static inline int io_uring_queue_init(unsigned int entries, - struct io_uring *ring, - unsigned int flags) +static inline int io_uring_queue_init_params(unsigned int entries, + struct io_uring *ring, + struct io_uring_params *p) { - struct io_uring_params p; int fd, ret; memset(ring, 0, sizeof(*ring)); - memset(&p, 0, sizeof(p)); - p.flags = flags; - fd = io_uring_setup(entries, &p); + fd = io_uring_setup(entries, p); if (fd < 0) return fd; - ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); + ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); if (!ret) ring->ring_fd = fd; else @@ -148,6 +145,18 @@ static inline int io_uring_queue_init(unsigned int entries, return ret; } +static inline int io_uring_queue_init(unsigned int entries, + struct io_uring *ring, + unsigned int flags) +{ + struct io_uring_params p; + + memset(&p, 0, sizeof(p)); + p.flags = flags; + + return io_uring_queue_init_params(entries, ring, &p); +} + /* Get a sqe */ static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) { -- cgit v1.2.3 From 145e0074392587606aa5df353d0e761f0b8357d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 21:11:45 +0000 Subject: selftests/io_uring: support NO_SQARRAY in miniliburing Add support for IORING_SETUP_NO_SQARRAY in miniliburing. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- tools/include/io_uring/mini_liburing.h | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h index a55407b09dbb..44be4446feda 100644 --- a/tools/include/io_uring/mini_liburing.h +++ b/tools/include/io_uring/mini_liburing.h @@ -6,6 +6,7 @@ #include #include #include +#include struct io_sq_ring { unsigned int *head; @@ -55,6 +56,7 @@ struct io_uring { struct io_uring_sq sq; struct io_uring_cq cq; int ring_fd; + unsigned flags; }; #if defined(__x86_64) || defined(__i386__) @@ -72,7 +74,14 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p, void *ptr; int ret; - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned int); + if (p->flags & IORING_SETUP_NO_SQARRAY) { + sq->ring_sz = p->cq_off.cqes; + sq->ring_sz += p->cq_entries * sizeof(struct io_uring_cqe); + } else { + sq->ring_sz = p->sq_off.array; + sq->ring_sz += p->sq_entries * sizeof(unsigned int); + } + ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); if (ptr == MAP_FAILED) @@ -83,7 +92,8 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p, sq->kring_entries = ptr + p->sq_off.ring_entries; sq->kflags = ptr + p->sq_off.flags; sq->kdropped = ptr + p->sq_off.dropped; - sq->array = ptr + p->sq_off.array; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + sq->array = ptr + p->sq_off.array; size = p->sq_entries * sizeof(struct io_uring_sqe); sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, @@ -138,10 +148,12 @@ static inline int io_uring_queue_init_params(unsigned int entries, if (fd < 0) return fd; ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); - if (!ret) + if (!ret) { ring->ring_fd = fd; - else + ring->flags = p->flags; + } else { close(fd); + } return ret; } @@ -208,10 +220,18 @@ static inline int io_uring_submit(struct io_uring *ring) ktail = *sq->ktail; to_submit = sq->sqe_tail - sq->sqe_head; - for (submitted = 0; submitted < to_submit; submitted++) { - read_barrier(); - sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + + if (!(ring->flags & IORING_SETUP_NO_SQARRAY)) { + for (submitted = 0; submitted < to_submit; submitted++) { + read_barrier(); + sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + } + } else { + ktail += to_submit; + sq->sqe_head += to_submit; + submitted = to_submit; } + if (!submitted) return 0; -- cgit v1.2.3 From 1ed797764315496a115cd0568450a7f72da80df6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Jan 2026 12:43:48 +0800 Subject: selftests/bpf: test bpf_get_func_arg() for tp_btf Test bpf_get_func_arg() and bpf_get_func_arg_cnt() for tp_btf. The code is most copied from test1 and test2. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260121044348.113201-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/get_func_args_test.c | 3 ++ .../selftests/bpf/progs/get_func_args_test.c | 44 ++++++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod-events.h | 10 +++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 4 ++ 4 files changed, 61 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index 64a9c95d4acf..fadee95d3ae8 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -33,11 +33,14 @@ void test_get_func_args_test(void) ASSERT_EQ(topts.retval >> 16, 1, "test_run"); ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run"); + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); + ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); + ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index e0f34a55e697..5b7233afef05 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -121,3 +121,47 @@ int BPF_PROG(fexit_test, int _a, int *_b, int _ret) test4_result &= err == 0 && ret == 1234; return 0; } + +__u64 test5_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test1_tp") +int BPF_PROG(tp_test1) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0; + __s64 err; + + test5_result = cnt == 1; + + err = bpf_get_func_arg(ctx, 0, &a); + test5_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test5_result &= err == -EINVAL; + + return 0; +} + +__u64 test6_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test2_tp") +int BPF_PROG(tp_test2) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, b = 0, z = 0; + __s64 err; + + test6_result = cnt == 2; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test6_result &= err == 0 && (int) a == 2; + + err = bpf_get_func_arg(ctx, 1, &b); + test6_result &= err == 0 && b == 3; + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 2, &z); + test6_result &= err == -EINVAL; + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h index aeef86b3da74..45a5e41f3a92 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h @@ -63,6 +63,16 @@ BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, sizeof(struct bpf_testmod_test_writable_ctx) ); +DECLARE_TRACE(bpf_testmod_fentry_test1, + TP_PROTO(int a), + TP_ARGS(a) +); + +DECLARE_TRACE(bpf_testmod_fentry_test2, + TP_PROTO(int a, u64 b), + TP_ARGS(a, b) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d425034b72d3..77a81fa8ec6a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -412,11 +412,15 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) noinline int bpf_testmod_fentry_test1(int a) { + trace_bpf_testmod_fentry_test1_tp(a); + return a + 1; } noinline int bpf_testmod_fentry_test2(int a, u64 b) { + trace_bpf_testmod_fentry_test2_tp(a, b); + return a + b; } -- cgit v1.2.3 From f4924ad0b13fd4ca4f0c7117dc143bf372224aec Mon Sep 17 00:00:00 2001 From: Yuzuki Ishiyama Date: Wed, 21 Jan 2026 12:33:28 +0900 Subject: selftests/bpf: Test kfunc bpf_strncasecmp Add testsuites for kfunc bpf_strncasecmp. Signed-off-by: Yuzuki Ishiyama Acked-by: Viktor Malik Link: https://lore.kernel.org/r/20260121033328.1850010-3-ishiyama@hpc.is.uec.ac.jp Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/string_kfuncs.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c | 6 ++++++ tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 7 +++++++ 4 files changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 0f3bf594e7a5..300032a19445 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -9,6 +9,7 @@ static const char * const test_cases[] = { "strcmp", "strcasecmp", + "strncasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 826e6b6aff7e..bddc4e8579d2 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -33,6 +33,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null1(void *ctx) { return bpf_strncasecmp(NULL, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null2(void *ctx) { return bpf_strncasecmp("HELLO", NULL, 5); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -57,6 +59,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { ret SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr1(void *ctx) { return bpf_strncasecmp(user_ptr, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr2(void *ctx) { return bpf_strncasecmp("HELLO", user_ptr, 5); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -83,6 +87,8 @@ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault1(void *ctx) { return bpf_strncasecmp(invalid_kern_ptr, "HELLO", 5); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault2(void *ctx) { return bpf_strncasecmp("HELLO", invalid_kern_ptr, 5); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 05e1da1f250f..412c53b87b18 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -8,6 +8,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } +SEC("syscall") int test_strncasecmp_too_long(void *ctx) { return bpf_strncasecmp(long_str, long_str, sizeof(long_str)); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index a8513964516b..f65b1226a81a 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -17,6 +17,13 @@ __test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO __test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } __test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } __test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } +__test(0) int test_strncasecmp_eq1(void *ctx) { return bpf_strncasecmp(str, "hello world", 11); } +__test(0) int test_strncasecmp_eq2(void *ctx) { return bpf_strncasecmp(str, "HELLO WORLD", 11); } +__test(0) int test_strncasecmp_eq3(void *ctx) { return bpf_strncasecmp(str, "HELLO world", 11); } +__test(0) int test_strncasecmp_eq4(void *ctx) { return bpf_strncasecmp(str, "hello", 5); } +__test(0) int test_strncasecmp_eq5(void *ctx) { return bpf_strncasecmp(str, "hello world!", 11); } +__test(-1) int test_strncasecmp_neq1(void *ctx) { return bpf_strncasecmp(str, "hello!", 6); } +__test(1) int test_strncasecmp_neq2(void *ctx) { return bpf_strncasecmp(str, "abc", 3); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } -- cgit v1.2.3 From 6d6ad32e22f028c525d5df471c5522616e645a6b Mon Sep 17 00:00:00 2001 From: Ziyu Chen Date: Wed, 21 Jan 2026 17:41:47 +0800 Subject: selftests/pidfd: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the typo "untill" → "until" in a comment in pidfd_info_test.c. This typo is already listed in scripts/spelling.txt by commit 66b47b4a9dad ("checkpatch: look for common misspellings"). Link: https://lore.kernel.org/r/20260121094147.4187337-1-chenziyu@uniontech.com Suggested-by: Cryolitia PukNgae Signed-off-by: Ziyu Chen Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_info_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index 6571e04acd88..8bed951e06a0 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -229,7 +229,7 @@ static void *pidfd_info_pause_thread(void *arg) close(ipc_socket); - /* Sleep untill we're killed. */ + /* Sleep until we're killed. */ pause(); return NULL; } -- cgit v1.2.3 From 3f2de814c0597c97d5abe09a1635d8c4e2fddaf2 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 21 Jan 2026 11:25:32 -0500 Subject: objtool: Fix libopcodes linking with static libraries Commit 436326bc525d ("objtool: fix build failure due to missing libopcodes check") tests for libopcodes using an empty main(), which passes even when static libraries lack their dependencies. This causes undefined reference errors (xmalloc, bfd_get_bits, etc.) when linking against static libopcodes without its required libbfd and libiberty. Fix by testing with an actual libopcodes symbol and trying increasingly complete library combinations until one succeeds. Fixes: 436326bc525d ("objtool: fix build failure due to missing libopcodes check") Reported-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Tested-by: Rafael J. Wysocki (Intel) Link: https://patch.msgid.link/20260121162532.1596238-1-sashal@kernel.org --- tools/objtool/Makefile | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 9b4503113ce5..a40f30232929 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -77,8 +77,21 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" # We check using HOSTCC directly rather than the shared feature framework # because objtool is a host tool that links against host libraries. # -HAVE_LIBOPCODES := $(shell echo 'int main(void) { return 0; }' | \ - $(HOSTCC) -xc - -o /dev/null -lopcodes 2>/dev/null && echo y) +# When using shared libraries, -lopcodes is sufficient as dependencies are +# resolved automatically. With static libraries, we must explicitly link +# against libopcodes' dependencies: libbfd, libiberty, and sometimes libz. +# Try each combination and use the first one that succeeds. +# +LIBOPCODES_LIBS := $(shell \ + for libs in "-lopcodes" \ + "-lopcodes -lbfd" \ + "-lopcodes -lbfd -liberty" \ + "-lopcodes -lbfd -liberty -lz"; do \ + echo 'extern void disassemble_init_for_target(void *);' \ + 'int main(void) { disassemble_init_for_target(0); return 0; }' | \ + $(HOSTCC) -xc - -o /dev/null $$libs 2>/dev/null && \ + echo "$$libs" && break; \ + done) # Styled disassembler support requires binutils >= 2.39 HAVE_DISASM_STYLED := $(shell echo '$(pound)include ' | \ @@ -86,10 +99,10 @@ HAVE_DISASM_STYLED := $(shell echo '$(pound)include ' | \ BUILD_DISAS := n -ifeq ($(HAVE_LIBOPCODES),y) +ifneq ($(LIBOPCODES_LIBS),) BUILD_DISAS := y OBJTOOL_CFLAGS += -DDISAS -DPACKAGE='"objtool"' - OBJTOOL_LDFLAGS += -lopcodes + OBJTOOL_LDFLAGS += $(LIBOPCODES_LIBS) ifeq ($(HAVE_DISASM_STYLED),y) OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED endif -- cgit v1.2.3 From a32ae2658471dd87a2f7a438388ed7d9a5767212 Mon Sep 17 00:00:00 2001 From: Kery Qi Date: Wed, 21 Jan 2026 17:41:16 +0800 Subject: selftests/bpf: Fix resource leak in serial_test_wq on attach failure When wq__attach() fails, serial_test_wq() returns early without calling wq__destroy(), leaking the skeleton resources allocated by wq__open_and_load(). This causes ASAN leak reports in selftests runs. Fix this by jumping to a common clean_up label that calls wq__destroy() on all exit paths after successful open_and_load. Note that the early return after wq__open_and_load() failure is correct and doesn't need fixing, since that function returns NULL on failure (after internally cleaning up any partial allocations). Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks") Signed-off-by: Kery Qi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20260121094114.1801-3-qikeyu2017@gmail.com --- tools/testing/selftests/bpf/prog_tests/wq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 15c67d23128b..84831eecc935 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -16,12 +16,12 @@ void serial_test_wq(void) /* re-run the success test to check if the timer was actually executed */ wq_skel = wq__open_and_load(); - if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + if (!ASSERT_OK_PTR(wq_skel, "wq__open_and_load")) return; err = wq__attach(wq_skel); if (!ASSERT_OK(err, "wq_attach")) - return; + goto clean_up; prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); err = bpf_prog_test_run_opts(prog_fd, &topts); @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); +clean_up: wq__destroy(wq_skel); } -- cgit v1.2.3 From cb68cba4453d3e021b27c2a08fcefdd1376a5ef0 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:23 +0000 Subject: tools/lib: Add list_is_first() Add list_is_first() to check whether @list is the first entry in list @head Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/list.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/list.h b/tools/include/linux/list.h index a4dfb6a7cc6a..a692ff7aed5c 100644 --- a/tools/include/linux/list.h +++ b/tools/include/linux/list.h @@ -169,6 +169,16 @@ static inline void list_move_tail(struct list_head *list, list_add_tail(list, head); } +/** + * list_is_first -- tests whether @list is the first entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_first(const struct list_head *list, const struct list_head *head) +{ + return list->prev == head; +} + /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test -- cgit v1.2.3 From d40c68a49f69c9bdb4ca14b3e6a0422bbaeb5d8f Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:24 +0000 Subject: perf header: Support CPU DOMAIN relation info The '/proc/schedstat' file gives info about load balancing statistics within a given domain. It also contains the cpu_mask giving information about the sibling cpus and domain names after schedstat version 17. Storing this information in perf header will help tools like `perf sched stats` for better analysis. Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf.data-file-format.txt | 17 ++ tools/perf/builtin-inject.c | 1 + tools/perf/util/env.c | 29 +++ tools/perf/util/env.h | 17 ++ tools/perf/util/header.c | 286 +++++++++++++++++++++ tools/perf/util/header.h | 1 + tools/perf/util/util.c | 42 +++ tools/perf/util/util.h | 3 + 8 files changed, 396 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index c9d4dec65344..0e4d0ecc9e12 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -447,6 +447,23 @@ struct { } [nr_pmu]; }; + HEADER_CPU_DOMAIN_INFO = 32, + +List of cpu-domain relation info. The format of the data is as below. + +struct domain_info { + int domain; + char dname[]; + char cpumask[]; + char cpulist[]; +}; + +struct cpu_domain_info { + int cpu; + int nr_domains; + struct domain_info domains[]; +}; + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index e2a653280e1b..c89ac85ec112 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2133,6 +2133,7 @@ static bool keep_feat(struct perf_inject *inject, int feat) case HEADER_CLOCK_DATA: case HEADER_HYBRID_TOPOLOGY: case HEADER_PMU_CAPS: + case HEADER_CPU_DOMAIN_INFO: return true; /* Information that can be updated */ case HEADER_BUILD_ID: diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index f1626d2032cd..93d475a80f14 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -216,6 +216,34 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused) } #endif // HAVE_LIBBPF_SUPPORT +void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr) +{ + if (!cd_map) + return; + + for (u32 i = 0; i < nr; i++) { + if (!cd_map[i]) + continue; + + for (u32 j = 0; j < cd_map[i]->nr_domains; j++) { + struct domain_info *d_info = cd_map[i]->domains[j]; + + if (!d_info) + continue; + + if (schedstat_version >= 17) + zfree(&d_info->dname); + + zfree(&d_info->cpumask); + zfree(&d_info->cpulist); + zfree(&d_info); + } + zfree(&cd_map[i]->domains); + zfree(&cd_map[i]); + } + zfree(&cd_map); +} + void perf_env__exit(struct perf_env *env) { int i, j; @@ -265,6 +293,7 @@ void perf_env__exit(struct perf_env *env) zfree(&env->pmu_caps[i].pmu_name); } zfree(&env->pmu_caps); + free_cpu_domain_info(env->cpu_domain, env->schedstat_version, env->nr_cpus_avail); } void perf_env__init(struct perf_env *env) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 9977b85523a8..76ba1a36e9ff 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -54,6 +54,19 @@ struct pmu_caps { char *pmu_name; }; +struct domain_info { + u32 domain; + char *dname; + char *cpumask; + char *cpulist; +}; + +struct cpu_domain_map { + u32 cpu; + u32 nr_domains; + struct domain_info **domains; +}; + typedef const char *(arch_syscalls__strerrno_t)(int err); struct perf_env { @@ -70,6 +83,8 @@ struct perf_env { unsigned int max_branches; unsigned int br_cntr_nr; unsigned int br_cntr_width; + unsigned int schedstat_version; + unsigned int max_sched_domains; int kernel_is_64_bit; int nr_cmdline; @@ -92,6 +107,7 @@ struct perf_env { char **cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; + struct cpu_domain_map **cpu_domain; int caches_cnt; u32 comp_ratio; u32 comp_ver; @@ -151,6 +167,7 @@ struct bpf_prog_info_node; struct btf_node; int perf_env__read_core_pmu_caps(struct perf_env *env); +void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr); void perf_env__exit(struct perf_env *env); int perf_env__kernel_is_64_bit(struct perf_env *env); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index f5cad377c99e..673d53bb2a2c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1614,6 +1614,162 @@ static int write_pmu_caps(struct feat_fd *ff, return 0; } +static struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, + u32 nr) +{ + struct domain_info *domain_info; + struct cpu_domain_map **cd_map; + char dname[16], cpumask[256]; + char cpulist[1024]; + char *line = NULL; + u32 cpu, domain; + u32 dcount = 0; + size_t len; + FILE *fp; + + fp = fopen("/proc/schedstat", "r"); + if (!fp) { + pr_err("Failed to open /proc/schedstat\n"); + return NULL; + } + + cd_map = zalloc(sizeof(*cd_map) * nr); + if (!cd_map) + goto out; + + while (getline(&line, &len, fp) > 0) { + int retval; + + if (strncmp(line, "version", 7) == 0) { + retval = sscanf(line, "version %d\n", schedstat_version); + if (retval != 1) + continue; + + } else if (strncmp(line, "cpu", 3) == 0) { + retval = sscanf(line, "cpu%u %*s", &cpu); + if (retval == 1) { + cd_map[cpu] = zalloc(sizeof(*cd_map[cpu])); + if (!cd_map[cpu]) + goto out_free_line; + cd_map[cpu]->cpu = cpu; + } else + continue; + + dcount = 0; + } else if (strncmp(line, "domain", 6) == 0) { + struct domain_info **temp_domains; + + dcount++; + temp_domains = realloc(cd_map[cpu]->domains, dcount * sizeof(domain_info)); + if (!temp_domains) + goto out_free_line; + else + cd_map[cpu]->domains = temp_domains; + + domain_info = zalloc(sizeof(*domain_info)); + if (!domain_info) + goto out_free_line; + + cd_map[cpu]->domains[dcount - 1] = domain_info; + + if (*schedstat_version >= 17) { + retval = sscanf(line, "domain%u %s %s %*s", &domain, dname, + cpumask); + if (retval != 3) + continue; + + domain_info->dname = strdup(dname); + if (!domain_info->dname) + goto out_free_line; + } else { + retval = sscanf(line, "domain%u %s %*s", &domain, cpumask); + if (retval != 2) + continue; + } + + domain_info->domain = domain; + if (domain > *max_sched_domains) + *max_sched_domains = domain; + + domain_info->cpumask = strdup(cpumask); + if (!domain_info->cpumask) + goto out_free_line; + + cpumask_to_cpulist(cpumask, cpulist); + domain_info->cpulist = strdup(cpulist); + if (!domain_info->cpulist) + goto out_free_line; + + cd_map[cpu]->nr_domains = dcount; + } + } + +out_free_line: + free(line); +out: + fclose(fp); + return cd_map; +} + +static int write_cpu_domain_info(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + u32 max_sched_domains = 0, schedstat_version = 0; + struct cpu_domain_map **cd_map; + u32 i, j, nr, ret; + + nr = cpu__max_present_cpu().cpu; + + cd_map = build_cpu_domain_map(&schedstat_version, &max_sched_domains, nr); + if (!cd_map) + return -1; + + ret = do_write(ff, &schedstat_version, sizeof(u32)); + if (ret < 0) + goto out; + + max_sched_domains += 1; + ret = do_write(ff, &max_sched_domains, sizeof(u32)); + if (ret < 0) + goto out; + + for (i = 0; i < nr; i++) { + if (!cd_map[i]) + continue; + + ret = do_write(ff, &cd_map[i]->cpu, sizeof(u32)); + if (ret < 0) + goto out; + + ret = do_write(ff, &cd_map[i]->nr_domains, sizeof(u32)); + if (ret < 0) + goto out; + + for (j = 0; j < cd_map[i]->nr_domains; j++) { + ret = do_write(ff, &cd_map[i]->domains[j]->domain, sizeof(u32)); + if (ret < 0) + goto out; + if (schedstat_version >= 17) { + ret = do_write_string(ff, cd_map[i]->domains[j]->dname); + if (ret < 0) + goto out; + } + + ret = do_write_string(ff, cd_map[i]->domains[j]->cpumask); + if (ret < 0) + goto out; + + ret = do_write_string(ff, cd_map[i]->domains[j]->cpulist); + if (ret < 0) + goto out; + } + } + +out: + free_cpu_domain_info(cd_map, schedstat_version, nr); + return ret; +} + static void print_hostname(struct feat_fd *ff, FILE *fp) { fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname); @@ -2247,6 +2403,39 @@ static void print_mem_topology(struct feat_fd *ff, FILE *fp) } } +static void print_cpu_domain_info(struct feat_fd *ff, FILE *fp) +{ + struct cpu_domain_map **cd_map = ff->ph->env.cpu_domain; + u32 nr = ff->ph->env.nr_cpus_avail; + struct domain_info *d_info; + u32 i, j; + + fprintf(fp, "# schedstat version : %u\n", ff->ph->env.schedstat_version); + fprintf(fp, "# Maximum sched domains : %u\n", ff->ph->env.max_sched_domains); + + for (i = 0; i < nr; i++) { + if (!cd_map[i]) + continue; + + fprintf(fp, "# cpu : %u\n", cd_map[i]->cpu); + fprintf(fp, "# nr_domains : %u\n", cd_map[i]->nr_domains); + + for (j = 0; j < cd_map[i]->nr_domains; j++) { + d_info = cd_map[i]->domains[j]; + if (!d_info) + continue; + + fprintf(fp, "# Domain : %u\n", d_info->domain); + + if (ff->ph->env.schedstat_version >= 17) + fprintf(fp, "# Domain name : %s\n", d_info->dname); + + fprintf(fp, "# Domain cpu map : %s\n", d_info->cpumask); + fprintf(fp, "# Domain cpu list : %s\n", d_info->cpulist); + } + } +} + static int __event_process_build_id(struct perf_record_header_build_id *bev, char *filename, struct perf_session *session) @@ -3388,6 +3577,102 @@ err: return ret; } +static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused) +{ + u32 schedstat_version, max_sched_domains, cpu, domain, nr_domains; + struct perf_env *env = &ff->ph->env; + char *dname, *cpumask, *cpulist; + struct cpu_domain_map **cd_map; + struct domain_info *d_info; + u32 nra, nr, i, j; + int ret; + + nra = env->nr_cpus_avail; + nr = env->nr_cpus_online; + + cd_map = zalloc(sizeof(*cd_map) * nra); + if (!cd_map) + return -1; + + env->cpu_domain = cd_map; + + ret = do_read_u32(ff, &schedstat_version); + if (ret) + return ret; + + env->schedstat_version = schedstat_version; + + ret = do_read_u32(ff, &max_sched_domains); + if (ret) + return ret; + + env->max_sched_domains = max_sched_domains; + + for (i = 0; i < nr; i++) { + if (do_read_u32(ff, &cpu)) + return -1; + + cd_map[cpu] = zalloc(sizeof(*cd_map[cpu])); + if (!cd_map[cpu]) + return -1; + + cd_map[cpu]->cpu = cpu; + + if (do_read_u32(ff, &nr_domains)) + return -1; + + cd_map[cpu]->nr_domains = nr_domains; + + cd_map[cpu]->domains = zalloc(sizeof(*d_info) * max_sched_domains); + if (!cd_map[cpu]->domains) + return -1; + + for (j = 0; j < nr_domains; j++) { + if (do_read_u32(ff, &domain)) + return -1; + + d_info = zalloc(sizeof(*d_info)); + if (!d_info) + return -1; + + cd_map[cpu]->domains[domain] = d_info; + d_info->domain = domain; + + if (schedstat_version >= 17) { + dname = do_read_string(ff); + if (!dname) + return -1; + + d_info->dname = zalloc(strlen(dname) + 1); + if (!d_info->dname) + return -1; + + d_info->dname = strdup(dname); + } + + cpumask = do_read_string(ff); + if (!cpumask) + return -1; + + d_info->cpumask = zalloc(strlen(cpumask) + 1); + if (!d_info->cpumask) + return -1; + d_info->cpumask = strdup(cpumask); + + cpulist = do_read_string(ff); + if (!cpulist) + return -1; + + d_info->cpulist = zalloc(strlen(cpulist) + 1); + if (!d_info->cpulist) + return -1; + d_info->cpulist = strdup(cpulist); + } + } + + return ret; +} + #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -3453,6 +3738,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(CLOCK_DATA, clock_data, false), FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), FEAT_OPR(PMU_CAPS, pmu_caps, false), + FEAT_OPR(CPU_DOMAIN_INFO, cpu_domain_info, true), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index c058021c3150..c62f3275a80f 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -53,6 +53,7 @@ enum { HEADER_CLOCK_DATA, HEADER_HYBRID_TOPOLOGY, HEADER_PMU_CAPS, + HEADER_CPU_DOMAIN_INFO, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 0f031eb80b4c..b87ff96a9f45 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -257,6 +257,48 @@ static int rm_rf_kcore_dir(const char *path) return 0; } +void cpumask_to_cpulist(char *cpumask, char *cpulist) +{ + int i, j, bm_size, nbits; + int len = strlen(cpumask); + unsigned long *bm; + char cpus[1024]; + + for (i = 0; i < len; i++) { + if (cpumask[i] == ',') { + for (j = i; j < len; j++) + cpumask[j] = cpumask[j + 1]; + } + } + + len = strlen(cpumask); + bm_size = (len + 15) / 16; + nbits = bm_size * 64; + if (nbits <= 0) + return; + + bm = calloc(bm_size, sizeof(unsigned long)); + if (!cpumask) + goto free_bm; + + for (i = 0; i < bm_size; i++) { + char blk[17]; + int blklen = len > 16 ? 16 : len; + + strncpy(blk, cpumask + len - blklen, blklen); + blk[blklen] = '\0'; + bm[i] = strtoul(blk, NULL, 16); + cpumask[len - blklen] = '\0'; + len = strlen(cpumask); + } + + bitmap_scnprintf(bm, nbits, cpus, sizeof(cpus)); + strcpy(cpulist, cpus); + +free_bm: + free(bm); +} + int rm_rf_perf_data(const char *path) { const char *pat[] = { diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 3423778e39a5..1572c8cf04e5 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifndef __cplusplus #include @@ -48,6 +49,8 @@ bool sysctl__nmi_watchdog_enabled(void); int perf_tip(char **strp, const char *dirpath); +void cpumask_to_cpulist(char *cpumask, char *cpulist); + #ifndef HAVE_SCHED_GETCPU_SUPPORT int sched_getcpu(void); #endif -- cgit v1.2.3 From 6ecc08329bab2c87f579cf1a8ab7799d8d88d9bc Mon Sep 17 00:00:00 2001 From: Andre Carvalho Date: Sun, 18 Jan 2026 11:00:27 +0000 Subject: selftests: netconsole: validate target resume Introduce a new netconsole selftest to validate that netconsole is able to resume a deactivated target when the low level interface comes back. The test setups the network using netdevsim, creates a netconsole target and then remove/add netdevsim in order to bring the same interfaces back. Afterwards, the test validates that the target works as expected. Targets are created via cmdline parameters to the module to ensure that we are able to resume targets that were bound by mac and interface name. Reviewed-by: Breno Leitao Signed-off-by: Andre Carvalho Tested-by: Breno Leitao Link: https://patch.msgid.link/20260118-netcons-retrigger-v11-7-4de36aebcf48@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + .../selftests/drivers/net/lib/sh/lib_netcons.sh | 35 +++++- .../selftests/drivers/net/netcons_resume.sh | 124 +++++++++++++++++++++ 3 files changed, 155 insertions(+), 5 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index f5c71d993750..3eba569b3366 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -19,6 +19,7 @@ TEST_PROGS := \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ + netcons_resume.sh \ netcons_sysdata.sh \ netcons_torture.sh \ netpoll_basic.py \ diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index ae8abff4be40..b6093bcf2b06 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -203,19 +203,21 @@ function do_cleanup() { function cleanup_netcons() { # delete netconsole dynamic reconfiguration # do not fail if the target is already disabled - if [[ ! -d "${NETCONS_PATH}" ]] + local TARGET_PATH=${1:-${NETCONS_PATH}} + + if [[ ! -d "${TARGET_PATH}" ]] then # in some cases this is called before netcons path is created return fi - if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + if [[ $(cat "${TARGET_PATH}"/enabled) != 0 ]] then - echo 0 > "${NETCONS_PATH}"/enabled || true + echo 0 > "${TARGET_PATH}"/enabled || true fi # Remove all the keys that got created during the selftest - find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete + find "${TARGET_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry - rmdir "${NETCONS_PATH}" + rmdir "${TARGET_PATH}" } function cleanup() { @@ -377,6 +379,29 @@ function check_netconsole_module() { fi } +function wait_target_state() { + local TARGET=${1} + local STATE=${2} + local TARGET_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" + local ENABLED=0 + + if [ "${STATE}" == "enabled" ] + then + ENABLED=1 + fi + + if [ ! -d "$TARGET_PATH" ]; then + echo "FAIL: Target does not exist." >&2 + exit "${ksft_fail}" + fi + + local CHECK_CMD="grep \"$ENABLED\" \"$TARGET_PATH/enabled\"" + slowwait 2 sh -c "test -n \"\$($CHECK_CMD)\"" || { + echo "FAIL: ${TARGET} is not ${STATE}." >&2 + exit "${ksft_fail}" + } +} + # A wrapper to translate protocol version to udp version function wait_for_port() { local NAMESPACE=${1} diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh new file mode 100755 index 000000000000..fc5e5e3ad3d4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_resume.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test validates that netconsole is able to resume a target that was +# deactivated when its interface was removed when the interface is brought +# back up. +# +# The test configures a netconsole target and then removes netdevsim module to +# cause the interface to disappear. Targets are configured via cmdline to ensure +# targets bound by interface name and mac address can be resumed. +# The test verifies that the target moved to disabled state before adding +# netdevsim and the interface back. +# +# Finally, the test verifies that the target is re-enabled automatically and +# the message is received on the destination interface. +# +# Author: Andre Carvalho + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +SAVED_SRCMAC="" # to be populated later +SAVED_DSTMAC="" # to be populated later + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +check_netconsole_module + +function cleanup() { + cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" + do_cleanup + rmmod netconsole +} + +function trigger_reactivation() { + # Add back low level module + modprobe netdevsim + # Recreate namespace and two interfaces + set_network + # Restore MACs + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ + address "${SAVED_DSTMAC}" + if [ "${BINDMODE}" == "mac" ]; then + ip link set dev "${SRCIF}" down + ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" + # Rename device in order to trigger target resume, as initial + # when device was recreated it didn't have correct mac address. + ip link set dev "${SRCIF}" name "${TARGET}" + fi +} + +function trigger_deactivation() { + # Start by storing mac addresses so we can be restored in reactivate + SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ + cat /sys/class/net/"$DSTIF"/address) + SAVED_SRCMAC=$(mac_get "${SRCIF}") + # Remove low level module + rmmod netdevsim +} + +trap cleanup EXIT + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + + # Create one namespace and two interfaces + set_network + + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + # Expose cmdline target in configfs + mkdir "${NETCONS_CONFIGFS}/cmdline0" + + # Target should be enabled + wait_target_state "cmdline0" "enabled" + + # Trigger deactivation by unloading netdevsim module. Target should be + # disabled. + trigger_deactivation + wait_target_state "cmdline0" "disabled" + + # Trigger reactivation by loading netdevsim, recreating the network and + # restoring mac addresses. Target should be re-enabled. + trigger_reactivation + wait_target_state "cmdline0" "enabled" + + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Cleanup & unload the module + cleanup + + echo "${BINDMODE} : Test passed" >&2 +done + +trap - EXIT +exit "${EXIT_STATUS}" -- cgit v1.2.3 From 04708606fd7bdc34b69089a4ff848ff36d7088f9 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 20 Jan 2026 13:39:30 +0000 Subject: selftests: net: amt: wait longer for connection before sending packets Both send_mcast4() and send_mcast6() use sleep 2 to wait for the tunnel connection between the gateway and the relay, and for the listener socket to be created in the LISTENER namespace. However, tests sometimes fail because packets are sent before the connection is fully established. Increase the waiting time to make the tests more reliable, and use wait_local_port_listen() to explicitly wait for the listener socket. Fixes: c08e8baea78e ("selftests: add amt interface selftest script") Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20260120133930.863845-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/amt.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index 3ef209cacb8e..663744305e52 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -73,6 +73,8 @@ # +------------------------+ #============================================================================== +source lib.sh + readonly LISTENER=$(mktemp -u listener-XXXXXXXX) readonly GATEWAY=$(mktemp -u gateway-XXXXXXXX) readonly RELAY=$(mktemp -u relay-XXXXXXXX) @@ -246,14 +248,15 @@ test_ipv6_forward() send_mcast4() { - sleep 2 + sleep 5 + wait_local_port_listen ${LISTENER} 4000 udp ip netns exec "${SOURCE}" bash -c \ 'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & } send_mcast6() { - sleep 2 + wait_local_port_listen ${LISTENER} 6000 udp ip netns exec "${SOURCE}" bash -c \ 'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & } -- cgit v1.2.3 From 830969e7821af377bdc1bb016929ff28c78490e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:34 +0100 Subject: selftests/rseq: Implement time slice extension test Provide an initial test case to evaluate the functionality. This needs to be extended to cover the ABI violations and expose the race condition between observing granted and arriving in rseq_slice_yield(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.320325431@linutronix.de --- tools/testing/selftests/rseq/.gitignore | 1 + tools/testing/selftests/rseq/Makefile | 5 +- tools/testing/selftests/rseq/rseq-abi.h | 27 ++++ tools/testing/selftests/rseq/slice_test.c | 219 ++++++++++++++++++++++++++++++ 4 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rseq/slice_test.c (limited to 'tools') diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 0fda241fa62b..ec01d164c1f0 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -10,3 +10,4 @@ param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice syscall_errors_test +slice_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d0a5fae5954..4ef90823b652 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test slice_test TEST_GEN_PROGS_EXTENDED = librseq.so @@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index fb4ec8a75dd4..ecef315204b2 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -53,6 +53,27 @@ struct rseq_abi_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_abi_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_abi_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -164,6 +185,12 @@ struct rseq_abi { */ __u32 mm_cid; + /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_abi_slice_ctrl slice_ctrl; + /* * Flexible array member at end of structure, after last feature field. */ diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c new file mode 100644 index 000000000000..357122dcb487 --- /dev/null +++ b/tools/testing/selftests/rseq/slice_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rseq.h" + +#include "../kselftest_harness.h" + +#ifndef __NR_rseq_slice_yield +# define __NR_rseq_slice_yield 471 +#endif + +#define BITS_PER_INT 32 +#define BITS_PER_BYTE 8 + +#ifndef PR_RSEQ_SLICE_EXTENSION +# define PR_RSEQ_SLICE_EXTENSION 79 +# define PR_RSEQ_SLICE_EXTENSION_GET 1 +# define PR_RSEQ_SLICE_EXTENSION_SET 2 +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 +#endif + +#ifndef RSEQ_SLICE_EXT_REQUEST_BIT +# define RSEQ_SLICE_EXT_REQUEST_BIT 0 +# define RSEQ_SLICE_EXT_GRANTED_BIT 1 +#endif + +#ifndef asm_inline +# define asm_inline asm __inline +#endif + +#define NSEC_PER_SEC 1000000000L +#define NSEC_PER_USEC 1000L + +struct noise_params { + int64_t noise_nsecs; + int64_t sleep_nsecs; + int64_t run; +}; + +FIXTURE(slice_ext) +{ + pthread_t noise_thread; + struct noise_params noise_params; +}; + +FIXTURE_VARIANT(slice_ext) +{ + int64_t total_nsecs; + int64_t slice_nsecs; + int64_t noise_nsecs; + int64_t sleep_nsecs; + bool no_yield; +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n50_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 50LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, + .no_yield = true, +}; + + +static inline bool elapsed(struct timespec *start, struct timespec *now, + int64_t span) +{ + int64_t delta = now->tv_sec - start->tv_sec; + + delta *= NSEC_PER_SEC; + delta += now->tv_nsec - start->tv_nsec; + return delta >= span; +} + +static void *noise_thread(void *arg) +{ + struct noise_params *p = arg; + + while (RSEQ_READ_ONCE(p->run)) { + struct timespec ts_start, ts_now; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, p->noise_nsecs)); + + ts_start.tv_sec = 0; + ts_start.tv_nsec = p->sleep_nsecs; + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL); + } + return NULL; +} + +FIXTURE_SETUP(slice_ext) +{ + cpu_set_t affinity; + + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); + + /* Pin it on a single CPU. Avoid CPU 0 */ + for (int i = 1; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &affinity)) + continue; + + CPU_ZERO(&affinity); + CPU_SET(i, &affinity); + ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0); + break; + } + + ASSERT_EQ(rseq_register_current_thread(), 0); + + ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); + + self->noise_params.noise_nsecs = variant->noise_nsecs; + self->noise_params.sleep_nsecs = variant->sleep_nsecs; + self->noise_params.run = 1; + + ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0); +} + +FIXTURE_TEARDOWN(slice_ext) +{ + self->noise_params.run = 0; + pthread_join(self->noise_thread, NULL); +} + +TEST_F(slice_ext, slice_test) +{ + unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0; + unsigned long total = 0, aborted = 0; + struct rseq_abi *rs = rseq_get_abi(); + struct timespec ts_start, ts_now; + + ASSERT_NE(rs, NULL); + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + struct timespec ts_cs; + bool req = false; + + clock_gettime(CLOCK_MONOTONIC, &ts_cs); + + total++; + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs)); + + /* + * request can be cleared unconditionally, but for making + * the stats work this is actually checking it first + */ + if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) { + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0); + /* Race between check and clear! */ + req = true; + success++; + } + + if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) { + /* The above raced against a late grant */ + if (req) + success--; + if (variant->no_yield) { + syscall(__NR_getpid); + aborted++; + } else { + yielded++; + if (!syscall(__NR_rseq_slice_yield)) + raced++; + } + } else { + if (!req) + scheduled++; + } + + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, variant->total_nsecs)); + + printf("# Total %12ld\n", total); + printf("# Success %12ld\n", success); + printf("# Yielded %12ld\n", yielded); + printf("# Aborted %12ld\n", aborted); + printf("# Scheduled %12ld\n", scheduled); + printf("# Raced %12ld\n", raced); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From bb332a9e5a057d2cb9b90e307b26cce9b1f6f660 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 15:10:29 +0100 Subject: selftests/rseq: Add rseq slice histogram script A script that processes trace-cmd data and generates a histogram of rseq slice_ext durations for the recorded workload. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143208.340549136@infradead.org --- Documentation/userspace-api/rseq.rst | 3 + tools/testing/selftests/rseq/rseq-slice-hist.py | 132 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/rseq/rseq-slice-hist.py (limited to 'tools') diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index 468f6bbe0e25..3cd27a3c7c7e 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -83,6 +83,9 @@ determined by debugfs:rseq/slice_ext_nsec. The default value is 5 usec; which is the minimum value. It can be incremented to 50 usecs, however doing so can/will affect the minimum scheduling latency. +Any proposed changes to this default will have to come with a selftest and +rseq-slice-hist.py output that shows the new value has merrit. + The kernel indicates the grant by clearing rseq::slice_ctrl::request and setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the thread after granting the extension, the kernel clears the granted bit to diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py new file mode 100644 index 000000000000..b7933eeaefb9 --- /dev/null +++ b/tools/testing/selftests/rseq/rseq-slice-hist.py @@ -0,0 +1,132 @@ +#!/usr/bin/python3 + +# +# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd +# + +from tracecmd import * + +def load_kallsyms(file_path='/proc/kallsyms'): + """ + Parses /proc/kallsyms into a dictionary. + Returns: { address_int: symbol_name } + """ + kallsyms_map = {} + + try: + with open(file_path, 'r') as f: + for line in f: + # The format is: [address] [type] [name] [module] + parts = line.split() + if len(parts) < 3: + continue + + addr = int(parts[0], 16) + name = parts[2] + + kallsyms_map[addr] = name + + except PermissionError: + print(f"Error: Permission denied reading {file_path}. Try running with sudo.") + except FileNotFoundError: + print(f"Error: {file_path} not found.") + + return kallsyms_map + +ksyms = load_kallsyms() + +# pending[timer_ptr] = {'ts': timestamp, 'comm': comm} +pending = {} + +# histograms[comm][bucket] = count +histograms = {} + +class OnlineHarmonicMean: + def __init__(self): + self.n = 0 # Count of elements + self.S = 0.0 # Cumulative sum of reciprocals + + def update(self, x): + if x == 0: + raise ValueError("Harmonic mean is undefined for zero.") + + self.n += 1 + self.S += 1.0 / x + return self.n / self.S + + @property + def mean(self): + return self.n / self.S if self.n > 0 else 0 + +ohms = {} + +def handle_start(record): + func_name = ksyms[record.num_field("function")] + if "rseq_slice_expired" in func_name: + timer_ptr = record.num_field("hrtimer") + pending[timer_ptr] = { + 'ts': record.ts, + 'comm': record.comm + } + return None + +def handle_cancel(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + duration_ns = record.ts - start_data['ts'] + duration_us = duration_ns // 1000 + + comm = start_data['comm'] + + if comm not in ohms: + ohms[comm] = OnlineHarmonicMean() + + ohms[comm].update(duration_ns) + + if comm not in histograms: + histograms[comm] = {} + + histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1 + return None + +def handle_expire(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + comm = start_data['comm'] + + if comm not in histograms: + histograms[comm] = {} + + # Record -1 bucket for expired (failed to cancel) + histograms[comm][-1] = histograms[comm].get(-1, 0) + 1 + return None + +if __name__ == "__main__": + t = Trace("trace.dat") + for cpu in range(0, t.cpus): + ev = t.read_event(cpu) + while ev: + if "hrtimer_start" in ev.name: + handle_start(ev) + if "hrtimer_cancel" in ev.name: + handle_cancel(ev) + if "hrtimer_expire_entry" in ev.name: + handle_expire(ev) + + ev = t.read_event(cpu) + + print("\n" + "="*40) + print("RSEQ SLICE HISTOGRAM (us)") + print("="*40) + for comm, buckets in histograms.items(): + print(f"\nTask: {comm} Mean: {ohms[comm].mean:.3f} ns") + print(f" {'Latency (us)':<15} | {'Count'}") + print(f" {'-'*30}") + # Sort buckets numerically, putting -1 at the top + for bucket in sorted(buckets.keys()): + label = "EXPIRED" if bucket == -1 else f"{bucket} us" + print(f" {label:<15} | {buckets[bucket]}") -- cgit v1.2.3 From 2a369c4942489aeab799a7509b7cc721eecafa8a Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 17 Jan 2026 13:10:51 +0100 Subject: kselftest/arm64: Use syscall() macro over nolibc my_syscall() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The my_syscall*() macros are internal implementation details of nolibc. Nolibc also provides the regular syscall(2), which is also a macro and directly expands to the correct my_syscall(). Use syscall() instead. As a side-effect this fixes some return value checks, as my_syscall() returns the raw value as set by the kernel and does not set errno. Signed-off-by: Thomas Weißschuh Acked-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 3 +- tools/testing/selftests/arm64/gcs/basic-gcs.c | 40 +++++++++++---------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 1703543fb7c7..ce4550fb7224 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -128,8 +128,7 @@ static int sys_clone(unsigned long clone_flags, unsigned long newsp, int *parent_tidptr, unsigned long tls, int *child_tidptr) { - return my_syscall5(__NR_clone, clone_flags, newsp, parent_tidptr, tls, - child_tidptr); + return syscall(__NR_clone, clone_flags, newsp, parent_tidptr, tls, child_tidptr); } #define __STACK_SIZE (8 * 1024 * 1024) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 250977abc398..ae4cce6afe2b 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -22,7 +22,7 @@ static size_t page_size = 65536; static __attribute__((noinline)) void valid_gcs_function(void) { /* Do something the compiler can't optimise out */ - my_syscall1(__NR_prctl, PR_SVE_GET_VL); + syscall(__NR_prctl, PR_SVE_GET_VL); } static inline int gcs_set_status(unsigned long mode) @@ -36,12 +36,10 @@ static inline int gcs_set_status(unsigned long mode) * other 3 values passed in registers to the syscall are zero * since the kernel validates them. */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, - 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, 0, 0, 0); if (ret == 0) { - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &new_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &new_mode, 0, 0, 0); if (ret == 0) { if (new_mode != mode) { ksft_print_msg("Mode set to %lx not %lx\n", @@ -49,7 +47,7 @@ static inline int gcs_set_status(unsigned long mode) ret = -EINVAL; } } else { - ksft_print_msg("Failed to validate mode: %d\n", ret); + ksft_print_msg("Failed to validate mode: %d\n", errno); } if (enabling != chkfeat_gcs()) { @@ -69,10 +67,9 @@ static bool read_status(void) unsigned long state; int ret; - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &state, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &state, 0, 0, 0); if (ret != 0) { - ksft_print_msg("Failed to read state: %d\n", ret); + ksft_print_msg("Failed to read state: %d\n", errno); return false; } @@ -188,9 +185,8 @@ static bool map_guarded_stack(void) int elem; bool pass = true; - buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size, - SHADOW_STACK_SET_MARKER | - SHADOW_STACK_SET_TOKEN); + buf = (void *)syscall(__NR_map_shadow_stack, 0, page_size, + SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN); if (buf == MAP_FAILED) { ksft_print_msg("Failed to map %lu byte GCS: %d\n", page_size, errno); @@ -257,8 +253,7 @@ static bool test_fork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = -EINVAL; @@ -321,8 +316,7 @@ static bool test_vfork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = EXIT_FAILURE; @@ -390,17 +384,15 @@ int main(void) if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) ksft_exit_skip("SKIP GCS not supported\n"); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", errno); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, - gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret); + ksft_exit_fail_msg("Failed to enable GCS: %d\n", errno); } ksft_set_plan(ARRAY_SIZE(tests)); @@ -410,9 +402,9 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) - ksft_print_msg("Failed to disable GCS: %d\n", ret); + ksft_print_msg("Failed to disable GCS: %d\n", errno); ksft_finished(); -- cgit v1.2.3 From 57a96356bb6942e16283138d0a42baad29169ed8 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 19 Jan 2026 10:29:28 +0800 Subject: kselftest/arm64: Add HWCAP test for FEAT_LS64 Add tests for FEAT_LS64. Issue related instructions if feature presents, no SIGILL should be received. When such instructions operate on Device memory or non-cacheable memory, we may received a SIGBUS during the test (w/o FEAT_LS64WB). Just ignore it since we only tested whether the instruction itself can be issued as expected on platforms declaring the support of such features. Acked-by: Arnd Bergmann Acked-by: Oliver Upton Signed-off-by: Yicong Yang Signed-off-by: Zhou Wang Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index c41640f18e4e..9d2df1f3e6bb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -595,6 +597,45 @@ static void lrcpc3_sigill(void) : "=r" (data0), "=r" (data1) : "r" (src) :); } +static void ignore_signal(int sig, siginfo_t *info, void *context) +{ + ucontext_t *uc = context; + + uc->uc_mcontext.pc += 4; +} + +static void ls64_sigill(void) +{ + struct sigaction ign, old; + char src[64] __aligned(64) = { 1 }; + + /* + * LS64 requires target memory to be Device/Non-cacheable (if + * FEAT_LS64WB not supported) and the completer supports these + * instructions, otherwise we'll receive a SIGBUS. Since we are only + * testing the ABI here, so just ignore the SIGBUS and see if we can + * execute the instructions without receiving a SIGILL. Restore the + * handler of SIGBUS after this test. + */ + ign.sa_sigaction = ignore_signal; + ign.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&ign.sa_mask); + sigaction(SIGBUS, &ign, &old); + + register void *xn asm ("x8") = src; + register u64 xt_1 asm ("x0"); + + /* LD64B x0, [x8] */ + asm volatile(".inst 0xf83fd100" : "=r" (xt_1) : "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + /* ST64B x0, [x8] */ + asm volatile(".inst 0xf83f9100" : : "r" (xt_1), "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + sigaction(SIGBUS, &old, NULL); +} + static const struct hwcap_data { const char *name; unsigned long at_hwcap; @@ -1134,6 +1175,14 @@ static const struct hwcap_data { .hwcap_bit = HWCAP3_MTE_STORE_ONLY, .cpuinfo = "mtestoreonly", }, + { + .name = "LS64", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LS64, + .cpuinfo = "ls64", + .sigill_fn = ls64_sigill, + .sigill_reliable = true, + }, }; typedef void (*sighandler_fn)(int, siginfo_t *, void *); -- cgit v1.2.3 From 0a98de80136968bab7db37b16282b37f044694d3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 Jan 2026 10:36:26 +0100 Subject: vsock/test: fix seqpacket message bounds test The test requires the sender (client) to send all messages before waking up the receiver (server). Since virtio-vsock had a bug and did not respect the size of the TX buffer, this test worked, but now that we are going to fix the bug, the test hangs because the sender would fill the TX buffer before waking up the receiver. Set the buffer size in the sender (client) as well, as we already do for the receiver (server). Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260121093628.9941-3-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 27e39354499a..668fbe9eb3cc 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -351,6 +351,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { + unsigned long long sock_buf_size; unsigned long curr_hash; size_t max_msg_size; int page_size; @@ -363,6 +364,16 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) exit(EXIT_FAILURE); } + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + /* Wait, until receiver sets buffer size. */ control_expectln("SRVREADY"); -- cgit v1.2.3 From 2a689f76edd04a53137bd01d4618343f4cdd7e23 Mon Sep 17 00:00:00 2001 From: Melbin K Mathew Date: Wed, 21 Jan 2026 10:36:28 +0100 Subject: vsock/test: add stream TX credit bounds test Add a regression test for the TX credit bounds fix. The test verifies that a sender with a small local buffer size cannot queue excessive data even when the peer advertises a large receive buffer. The client: - Sets a small buffer size (64 KiB) - Connects to server (which advertises 2 MiB buffer) - Sends in non-blocking mode until EAGAIN - Verifies total queued data is bounded This guards against the original vulnerability where a remote peer could cause unbounded kernel memory allocation by advertising a large buffer and reading slowly. Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew [Stefano: use sock_buf_size to check the bytes sent + small fixes] Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260121093628.9941-5-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 668fbe9eb3cc..5bd20ccd9335 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -347,6 +347,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) +#define SOCK_BUF_SIZE_SMALL (64 * 1024) #define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) @@ -2230,6 +2231,101 @@ static void test_stream_accepted_setsockopt_server(const struct test_opts *opts) close(fd); } +static void test_stream_tx_credit_bounds_client(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + size_t total = 0; + char buf[4096]; + int fd; + + memset(buf, 'A', sizeof(buf)); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + sock_buf_size = SOCK_BUF_SIZE_SMALL; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) < 0) { + perror("fcntl(F_SETFL)"); + exit(EXIT_FAILURE); + } + + control_expectln("SRVREADY"); + + for (;;) { + ssize_t sent = send(fd, buf, sizeof(buf), 0); + + if (sent == 0) { + fprintf(stderr, "unexpected EOF while sending bytes\n"); + exit(EXIT_FAILURE); + } + + if (sent < 0) { + if (errno == EINTR) + continue; + + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + + perror("send"); + exit(EXIT_FAILURE); + } + + total += sent; + } + + control_writeln("CLIDONE"); + close(fd); + + /* We should not be able to send more bytes than the value set as + * local buffer size. + */ + if (total > sock_buf_size) { + fprintf(stderr, + "TX credit too large: queued %zu bytes (expected <= %llu)\n", + total, sock_buf_size); + exit(EXIT_FAILURE); + } +} + +static void test_stream_tx_credit_bounds_server(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + control_writeln("SRVREADY"); + control_expectln("CLIDONE"); + + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -2419,6 +2515,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_mangle_client, .run_server = test_stream_msgzcopy_mangle_server, }, + { + .name = "SOCK_STREAM TX credit bounds", + .run_client = test_stream_tx_credit_bounds_client, + .run_server = test_stream_tx_credit_bounds_server, + }, {}, }; -- cgit v1.2.3 From c3030995f23b3d35f94b9bc4375706ec5916fd55 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:25 +0000 Subject: perf sched stats: Add record and rawdump support Define new, perf tool only, sample types and their layouts. Add logic to parse /proc/schedstat, convert it to perf sample format and save samples to perf.data file with `perf sched stats record` command. Also add logic to read perf.data file, interpret schedstat samples and print rawdump of samples with `perf script -D`. Note that, /proc/schedstat file output is standardized with version number. The patch supports v15 but older or newer version can be added easily. Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Tested-by: James Clark Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han [ PRIu64 needs uint64_t, not 'unsigned long' to work on both 32-bit and 64-bit ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Documentation/libperf.txt | 2 + tools/lib/perf/Makefile | 1 + tools/lib/perf/include/perf/event.h | 41 +++++ tools/lib/perf/include/perf/schedstat-v15.h | 146 ++++++++++++++++++ tools/perf/builtin-inject.c | 2 + tools/perf/builtin-sched.c | 222 +++++++++++++++++++++++++++- tools/perf/util/event.c | 40 +++++ tools/perf/util/event.h | 2 + tools/perf/util/session.c | 22 +++ tools/perf/util/synthetic-events.c | 179 ++++++++++++++++++++++ tools/perf/util/synthetic-events.h | 3 + tools/perf/util/tool.c | 20 +++ tools/perf/util/tool.h | 4 +- 13 files changed, 682 insertions(+), 2 deletions(-) create mode 100644 tools/lib/perf/include/perf/schedstat-v15.h (limited to 'tools') diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt index 4072bc9b7670..576ecc5fc312 100644 --- a/tools/lib/perf/Documentation/libperf.txt +++ b/tools/lib/perf/Documentation/libperf.txt @@ -211,6 +211,8 @@ SYNOPSIS struct perf_record_header_feature; struct perf_record_compressed; struct perf_record_compressed2; + struct perf_record_schedstat_cpu; + struct perf_record_schedstat_domain; -- DESCRIPTION diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 7fbb50b74c00..9fa28e512ca8 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -179,6 +179,7 @@ install_lib: libs cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ) HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h +HDRS += schedstat-v15.h INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 43a8cb04994f..ce04fed7cefc 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -496,6 +496,43 @@ struct perf_record_bpf_metadata { struct perf_record_bpf_metadata_entry entries[]; }; +struct perf_record_schedstat_cpu_v15 { +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) _type _name +#include "schedstat-v15.h" +#undef CPU_FIELD +}; + +struct perf_record_schedstat_cpu { + struct perf_event_header header; + __u64 timestamp; + __u32 cpu; + __u16 version; + /* Padding */ + char __pad[2]; + union { + struct perf_record_schedstat_cpu_v15 v15; + }; +}; + +struct perf_record_schedstat_domain_v15 { +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) _type _name +#include "schedstat-v15.h" +#undef DOMAIN_FIELD +}; + +#define DOMAIN_NAME_LEN 16 + +struct perf_record_schedstat_domain { + struct perf_event_header header; + __u64 timestamp; + __u32 cpu; + __u16 version; + __u16 domain; + union { + struct perf_record_schedstat_domain_v15 v15; + }; +}; + enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_USER_TYPE_START = 64, PERF_RECORD_HEADER_ATTR = 64, @@ -519,6 +556,8 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_FINISHED_INIT = 82, PERF_RECORD_COMPRESSED2 = 83, PERF_RECORD_BPF_METADATA = 84, + PERF_RECORD_SCHEDSTAT_CPU = 85, + PERF_RECORD_SCHEDSTAT_DOMAIN = 86, PERF_RECORD_HEADER_MAX }; @@ -562,6 +601,8 @@ union perf_event { struct perf_record_compressed pack; struct perf_record_compressed2 pack2; struct perf_record_bpf_metadata bpf_metadata; + struct perf_record_schedstat_cpu schedstat_cpu; + struct perf_record_schedstat_domain schedstat_domain; }; #endif /* __LIBPERF_EVENT_H */ diff --git a/tools/lib/perf/include/perf/schedstat-v15.h b/tools/lib/perf/include/perf/schedstat-v15.h new file mode 100644 index 000000000000..639458df05f8 --- /dev/null +++ b/tools/lib/perf/include/perf/schedstat-v15.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CPU_FIELD +CPU_FIELD(__u32, yld_count, "sched_yield() count", + "%11u", false, yld_count, v15); +CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored", + "%11u", false, array_exp, v15); +CPU_FIELD(__u32, sched_count, "schedule() called", + "%11u", false, sched_count, v15); +CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle", + "%11u", true, sched_count, v15); +CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called", + "%11u", false, ttwu_count, v15); +CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu", + "%11u", true, ttwu_count, v15); +CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)", + "%11llu", false, rq_cpu_time, v15); +CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)", + "%11llu", true, rq_cpu_time, v15); +CPU_FIELD(__u64, pcount, "total timeslices run on this cpu", + "%11llu", false, pcount, v15); +#endif + +#ifdef DOMAIN_FIELD +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, idle_lb_count, + "load_balance() count on cpu idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, idle_lb_balanced, + "load_balance() found balanced on cpu idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, idle_lb_failed, + "load_balance() move task failed on cpu idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, idle_lb_imbalance, + "imbalance sum on cpu idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, idle_lb_gained, + "pull_task() count on cpu idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, idle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, idle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, idle_lb_nobusyg, + "load_balance() failed to find busier group on cpu idle", "%11u", true, v15); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u", + idle_lb_count, idle_lb_balanced, idle_lb_failed, v15); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(idle_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf", + idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v15); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, busy_lb_count, + "load_balance() count on cpu busy", "%11u", true, v15); +DOMAIN_FIELD(__u32, busy_lb_balanced, + "load_balance() found balanced on cpu busy", "%11u", true, v15); +DOMAIN_FIELD(__u32, busy_lb_failed, + "load_balance() move task failed on cpu busy", "%11u", true, v15); +DOMAIN_FIELD(__u32, busy_lb_imbalance, + "imbalance sum on cpu busy", "%11u", false, v15); +DOMAIN_FIELD(__u32, busy_lb_gained, + "pull_task() count on cpu busy", "%11u", false, v15); +DOMAIN_FIELD(__u32, busy_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v15); +DOMAIN_FIELD(__u32, busy_lb_nobusyq, + "load_balance() failed to find busier queue on cpu busy", "%11u", true, v15); +DOMAIN_FIELD(__u32, busy_lb_nobusyg, + "load_balance() failed to find busier group on cpu busy", "%11u", true, v15); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u", + busy_lb_count, busy_lb_balanced, busy_lb_failed, v15); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(busy_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf", + busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v15); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, newidle_lb_count, + "load_balance() count on cpu newly idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, newidle_lb_balanced, + "load_balance() found balanced on cpu newly idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, newidle_lb_failed, + "load_balance() move task failed on cpu newly idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, newidle_lb_imbalance, + "imbalance sum on cpu newly idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, newidle_lb_gained, + "pull_task() count on cpu newly idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, newidle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v15); +DOMAIN_FIELD(__u32, newidle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v15); +DOMAIN_FIELD(__u32, newidle_lb_nobusyg, + "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v15); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(newidle_lb_success_count, + "load_balance() success count on cpu newly idle", "%11u", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v15); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(newidle_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v15); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, alb_count, + "active_load_balance() count", "%11u", false, v15); +DOMAIN_FIELD(__u32, alb_failed, + "active_load_balance() move task failed", "%11u", false, v15); +DOMAIN_FIELD(__u32, alb_pushed, + "active_load_balance() successfully moved a task", "%11u", false, v15); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbe_count, + "sbe_count is not used", "%11u", false, v15); +DOMAIN_FIELD(__u32, sbe_balanced, + "sbe_balanced is not used", "%11u", false, v15); +DOMAIN_FIELD(__u32, sbe_pushed, + "sbe_pushed is not used", "%11u", false, v15); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbf_count, + "sbf_count is not used", "%11u", false, v15); +DOMAIN_FIELD(__u32, sbf_balanced, + "sbf_balanced is not used", "%11u", false, v15); +DOMAIN_FIELD(__u32, sbf_pushed, + "sbf_pushed is not used", "%11u", false, v15); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, ttwu_wake_remote, + "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v15); +DOMAIN_FIELD(__u32, ttwu_move_affine, + "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v15); +DOMAIN_FIELD(__u32, ttwu_move_balance, + "try_to_wake_up() started passive balancing", "%11u", false, v15); +#endif /* DOMAIN_FIELD */ diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index c89ac85ec112..2c9456614cde 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2657,6 +2657,8 @@ int cmd_inject(int argc, const char **argv) inject.tool.compressed = perf_event__repipe_op4_synth; inject.tool.auxtrace = perf_event__repipe_auxtrace; inject.tool.bpf_metadata = perf_event__repipe_op2_synth; + inject.tool.schedstat_cpu = perf_event__repipe_op2_synth; + inject.tool.schedstat_domain = perf_event__repipe_op2_synth; inject.tool.dont_split_sample_group = true; inject.tool.merge_deferred_callchains = false; inject.session = __perf_session__new(&data, &inject.tool, diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index eca3b1c58c4b..ee3b4e42156e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -28,6 +28,8 @@ #include "util/debug.h" #include "util/event.h" #include "util/util.h" +#include "util/synthetic-events.h" +#include "util/target.h" #include #include @@ -55,6 +57,7 @@ #define MAX_PRIO 140 static const char *cpu_list; +static struct perf_cpu_map *user_requested_cpus; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); struct sched_atom; @@ -236,6 +239,9 @@ struct perf_sched { volatile bool thread_funcs_exit; const char *prio_str; DECLARE_BITMAP(prio_bitmap, MAX_PRIO); + + struct perf_session *session; + struct perf_data *data; }; /* per thread run time data */ @@ -3734,6 +3740,195 @@ static void setup_sorting(struct perf_sched *sched, const struct option *options sort_dimension__add("pid", &sched->cmp_pid); } +static int process_synthesized_schedstat_event(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct perf_sched *sched = container_of(tool, struct perf_sched, tool); + + if (perf_data__write(sched->data, event, event->header.size) <= 0) { + pr_err("failed to write perf data, error: %m\n"); + return -1; + } + + sched->session->header.data_size += event->header.size; + return 0; +} + +static void sighandler(int sig __maybe_unused) +{ +} + +static int enable_sched_schedstats(int *reset) +{ + char path[PATH_MAX]; + FILE *fp; + char ch; + + snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint()); + fp = fopen(path, "w+"); + if (!fp) { + pr_err("Failed to open %s\n", path); + return -1; + } + + ch = getc(fp); + if (ch == '0') { + *reset = 1; + rewind(fp); + putc('1', fp); + fclose(fp); + } + return 0; +} + +static int disable_sched_schedstat(void) +{ + char path[PATH_MAX]; + FILE *fp; + + snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint()); + fp = fopen(path, "w"); + if (!fp) { + pr_err("Failed to open %s\n", path); + return -1; + } + + putc('0', fp); + fclose(fp); + return 0; +} + +/* perf.data or any other output file name used by stats subcommand (only). */ +const char *output_name; + +static int perf_sched__schedstat_record(struct perf_sched *sched, + int argc, const char **argv) +{ + struct perf_session *session; + struct target target = {}; + struct evlist *evlist; + int reset = 0; + int err = 0; + int fd; + struct perf_data data = { + .path = output_name, + .mode = PERF_DATA_MODE_WRITE, + }; + + signal(SIGINT, sighandler); + signal(SIGCHLD, sighandler); + signal(SIGTERM, sighandler); + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + session = perf_session__new(&data, &sched->tool); + if (IS_ERR(session)) { + pr_err("Perf session creation failed.\n"); + evlist__delete(evlist); + return PTR_ERR(session); + } + + session->evlist = evlist; + + sched->session = session; + sched->data = &data; + + fd = perf_data__fd(&data); + + /* + * Capture all important metadata about the system. Although they are + * not used by `perf sched stats` tool directly, they provide useful + * information about profiled environment. + */ + perf_header__set_feat(&session->header, HEADER_HOSTNAME); + perf_header__set_feat(&session->header, HEADER_OSRELEASE); + perf_header__set_feat(&session->header, HEADER_VERSION); + perf_header__set_feat(&session->header, HEADER_ARCH); + perf_header__set_feat(&session->header, HEADER_NRCPUS); + perf_header__set_feat(&session->header, HEADER_CPUDESC); + perf_header__set_feat(&session->header, HEADER_CPUID); + perf_header__set_feat(&session->header, HEADER_TOTAL_MEM); + perf_header__set_feat(&session->header, HEADER_CMDLINE); + perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY); + perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY); + perf_header__set_feat(&session->header, HEADER_CACHE); + perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY); + perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY); + perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO); + + err = perf_session__write_header(session, evlist, fd, false); + if (err < 0) + goto out; + + /* + * `perf sched stats` does not support workload profiling (-p pid) + * since /proc/schedstat file contains cpu specific data only. Hence, a + * profile target is either set of cpus or systemwide, never a process. + * Note that, although `-- ` is supported, profile data are + * still cpu/systemwide. + */ + if (cpu_list) + target.cpu_list = cpu_list; + else + target.system_wide = true; + + if (argc) { + err = evlist__prepare_workload(evlist, &target, argv, false, NULL); + if (err) + goto out; + } + + err = evlist__create_maps(evlist, &target); + if (err < 0) + goto out; + + user_requested_cpus = evlist->core.user_requested_cpus; + + err = perf_event__synthesize_schedstat(&(sched->tool), + process_synthesized_schedstat_event, + user_requested_cpus); + if (err < 0) + goto out; + + err = enable_sched_schedstats(&reset); + if (err < 0) + goto out; + + if (argc) + evlist__start_workload(evlist); + + /* wait for signal */ + pause(); + + if (reset) { + err = disable_sched_schedstat(); + if (err < 0) + goto out; + } + + err = perf_event__synthesize_schedstat(&(sched->tool), + process_synthesized_schedstat_event, + user_requested_cpus); + if (err < 0) + goto out; + + err = perf_session__write_header(session, evlist, fd, true); + +out: + if (!err) + fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path); + else + fprintf(stderr, "[ perf sched stats: Failed !! ]\n"); + + evlist__delete(evlist); + close(fd); + return err; +} + static bool schedstat_events_exposed(void) { /* @@ -3910,6 +4105,12 @@ int cmd_sched(int argc, const char **argv) OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), OPT_PARENT(sched_options) }; + const struct option stats_options[] = { + OPT_STRING('o', "output", &output_name, "file", + "`stats record` with output filename"), + OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_END() + }; const char * const latency_usage[] = { "perf sched latency []", @@ -3927,9 +4128,13 @@ int cmd_sched(int argc, const char **argv) "perf sched timehist []", NULL }; + const char *stats_usage[] = { + "perf sched stats {record} []", + NULL + }; const char *const sched_subcommands[] = { "record", "latency", "map", "replay", "script", - "timehist", NULL }; + "timehist", "stats", NULL }; const char *sched_usage[] = { NULL, NULL @@ -4027,6 +4232,21 @@ int cmd_sched(int argc, const char **argv) ret = symbol__validate_sym_arguments(); if (!ret) ret = perf_sched__timehist(&sched); + } else if (!strcmp(argv[0], "stats")) { + const char *const stats_subcommands[] = {"record", NULL}; + + argc = parse_options_subcommand(argc, argv, stats_options, + stats_subcommands, + stats_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + + if (argv[0] && !strcmp(argv[0], "record")) { + if (argc) + argc = parse_options(argc, argv, stats_options, + stats_usage, 0); + return perf_sched__schedstat_record(&sched, argc, argv); + } + usage_with_options(stats_usage, stats_options); } else { usage_with_options(sched_usage, sched_options); } diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 4c92cc1a952c..5a98c16e1092 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -83,6 +83,8 @@ static const char *perf_event__names[] = { [PERF_RECORD_FINISHED_INIT] = "FINISHED_INIT", [PERF_RECORD_COMPRESSED2] = "COMPRESSED2", [PERF_RECORD_BPF_METADATA] = "BPF_METADATA", + [PERF_RECORD_SCHEDSTAT_CPU] = "SCHEDSTAT_CPU", + [PERF_RECORD_SCHEDSTAT_DOMAIN] = "SCHEDSTAT_DOMAIN", }; const char *perf_event__name(unsigned int id) @@ -571,6 +573,44 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma return ret; } +size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp) +{ + struct perf_record_schedstat_cpu *cs = &event->schedstat_cpu; + size_t size = fprintf(fp, "\ncpu%u ", cs->cpu); + __u16 version = cs->version; + +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + size += fprintf(fp, "%" PRIu64 " ", (uint64_t)cs->_ver._name) + + if (version == 15) { +#include + return size; + } +#undef CPU_FIELD + + return fprintf(fp, "Unsupported /proc/schedstat version %d.\n", + event->schedstat_cpu.version); +} + +size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp) +{ + struct perf_record_schedstat_domain *ds = &event->schedstat_domain; + __u16 version = ds->version; + size_t size = fprintf(fp, "\ndomain%u ", ds->domain); + +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ + size += fprintf(fp, "%" PRIu64 " ", (uint64_t)ds->_ver._name) + + if (version == 15) { +#include + return size; + } +#undef DOMAIN_FIELD + + return fprintf(fp, "Unsupported /proc/schedstat version %d.\n", + event->schedstat_domain.version); +} + size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp) { size_t ret = fprintf(fp, "PERF_RECORD_%s", diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 64c63b59d617..2ea83fdf8a03 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -392,6 +392,8 @@ size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp); size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp); size_t perf_event__fprintf_bpf_metadata(union perf_event *event, FILE *fp); size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *machine,FILE *fp); +size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp); size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp); int kallsyms__get_function_start(const char *kallsyms_filename, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d7b28cb4e672..c0231bc000e7 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -698,6 +698,20 @@ static void perf_event__time_conv_swap(union perf_event *event, } } +static void +perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused, + bool sample_id_all __maybe_unused) +{ + /* FIXME */ +} + +static void +perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused, + bool sample_id_all __maybe_unused) +{ + /* FIXME */ +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -737,6 +751,8 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, [PERF_RECORD_TIME_CONV] = perf_event__time_conv_swap, + [PERF_RECORD_SCHEDSTAT_CPU] = perf_event__schedstat_cpu_swap, + [PERF_RECORD_SCHEDSTAT_DOMAIN] = perf_event__schedstat_domain_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1667,6 +1683,12 @@ static s64 perf_session__process_user_event(struct perf_session *session, case PERF_RECORD_BPF_METADATA: err = tool->bpf_metadata(tool, session, event); break; + case PERF_RECORD_SCHEDSTAT_CPU: + err = tool->schedstat_cpu(tool, session, event); + break; + case PERF_RECORD_SCHEDSTAT_DOMAIN: + err = tool->schedstat_domain(tool, session, event); + break; default: err = -EINVAL; break; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2ba9fa25e00a..5366ea921e70 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2529,3 +2529,182 @@ int parse_synth_opt(char *synth) return ret; } + +static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version, + __u64 *cpu, __u64 timestamp) +{ + struct perf_record_schedstat_cpu *cs; + union perf_event *event; + size_t size; + char ch; + + size = sizeof(*cs); + size = PERF_ALIGN(size, sizeof(u64)); + event = zalloc(size); + + if (!event) + return NULL; + + cs = &event->schedstat_cpu; + cs->header.type = PERF_RECORD_SCHEDSTAT_CPU; + cs->header.size = size; + cs->timestamp = timestamp; + + if (io__get_char(io) != 'p' || io__get_char(io) != 'u') + goto out_cpu; + + if (io__get_dec(io, (__u64 *)cpu) != ' ') + goto out_cpu; + +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + do { \ + __u64 _tmp; \ + ch = io__get_dec(io, &_tmp); \ + if (ch != ' ' && ch != '\n') \ + goto out_cpu; \ + cs->_ver._name = _tmp; \ + } while (0) + + if (version == 15) { +#include + } +#undef CPU_FIELD + + cs->cpu = *cpu; + cs->version = version; + + return event; +out_cpu: + free(event); + return NULL; +} + +static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 version, + __u64 cpu, __u64 timestamp) +{ + struct perf_record_schedstat_domain *ds; + union perf_event *event = NULL; + __u64 d_num; + size_t size; + char ch; + + if (io__get_char(io) != 'o' || io__get_char(io) != 'm' || io__get_char(io) != 'a' || + io__get_char(io) != 'i' || io__get_char(io) != 'n') + return NULL; + + ch = io__get_dec(io, &d_num); + + /* Skip cpumask as it can be extracted from perf header */ + while (io__get_char(io) != ' ') + continue; + + size = sizeof(*ds); + size = PERF_ALIGN(size, sizeof(u64)); + event = zalloc(size); + + ds = &event->schedstat_domain; + ds->header.type = PERF_RECORD_SCHEDSTAT_DOMAIN; + ds->header.size = size; + ds->version = version; + ds->timestamp = timestamp; + ds->domain = d_num; + +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ + do { \ + __u64 _tmp; \ + ch = io__get_dec(io, &_tmp); \ + if (ch != ' ' && ch != '\n') \ + goto out_domain; \ + ds->_ver._name = _tmp; \ + } while (0) + + if (version == 15) { +#include + } +#undef DOMAIN_FIELD + + ds->cpu = cpu; + goto out; + +out_domain: + free(event); + event = NULL; +out: + return event; +} + +int perf_event__synthesize_schedstat(const struct perf_tool *tool, + perf_event__handler_t process, + struct perf_cpu_map *user_requested_cpus) +{ + char *line = NULL, path[PATH_MAX]; + union perf_event *event = NULL; + size_t line_len = 0; + char bf[BUFSIZ]; + __u64 timestamp; + __u64 cpu = -1; + __u16 version; + struct io io; + int ret = -1; + char ch; + + snprintf(path, PATH_MAX, "%s/schedstat", procfs__mountpoint()); + io.fd = open(path, O_RDONLY, 0); + if (io.fd < 0) { + pr_err("Failed to open %s. Possibly CONFIG_SCHEDSTAT is disabled.\n", path); + return -1; + } + io__init(&io, io.fd, bf, sizeof(bf)); + + if (io__getline(&io, &line, &line_len) < 0 || !line_len) + goto out; + + if (!strcmp(line, "version 15\n")) { + version = 15; + } else { + pr_err("Unsupported %s version: %s", path, line + 8); + goto out_free_line; + } + + if (io__getline(&io, &line, &line_len) < 0 || !line_len) + goto out_free_line; + timestamp = atol(line + 10); + + /* + * FIXME: Can be optimized a bit by not synthesizing domain samples + * for filtered out cpus. + */ + for (ch = io__get_char(&io); !io.eof; ch = io__get_char(&io)) { + struct perf_cpu this_cpu; + + if (ch == 'c') { + event = __synthesize_schedstat_cpu(&io, version, + &cpu, timestamp); + } else if (ch == 'd') { + event = __synthesize_schedstat_domain(&io, version, + cpu, timestamp); + } + if (!event) + goto out_free_line; + + this_cpu.cpu = cpu; + + if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu)) + continue; + + if (process(tool, event, NULL, NULL) < 0) { + free(event); + goto out_free_line; + } + + free(event); + } + + ret = 0; + +out_free_line: + free(line); +out: + close(io.fd); + return ret; +} diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index f8588b6cf11a..b0edad0c3100 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -128,4 +128,7 @@ int perf_event__synthesize_for_pipe(const struct perf_tool *tool, struct perf_data *data, perf_event__handler_t process); +int perf_event__synthesize_schedstat(const struct perf_tool *tool, + perf_event__handler_t process, + struct perf_cpu_map *user_requested_cpu); #endif // __PERF_SYNTHETIC_EVENTS_H diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index 27ba5849c74a..013c7839e2cf 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -253,7 +253,25 @@ static int perf_event__process_bpf_metadata_stub(const struct perf_tool *tool __ { if (dump_trace) perf_event__fprintf_bpf_metadata(event, stdout); + dump_printf(": unhandled!\n"); + return 0; +} +static int process_schedstat_cpu_stub(const struct perf_tool *tool __maybe_unused, + struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_schedstat_cpu(event, stdout); + dump_printf(": unhandled!\n"); + return 0; +} +static int process_schedstat_domain_stub(const struct perf_tool *tool __maybe_unused, + struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_schedstat_domain(event, stdout); dump_printf(": unhandled!\n"); return 0; } @@ -317,6 +335,8 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events) #endif tool->finished_init = process_event_op2_stub; tool->bpf_metadata = perf_event__process_bpf_metadata_stub; + tool->schedstat_cpu = process_schedstat_cpu_stub; + tool->schedstat_domain = process_schedstat_domain_stub; } bool perf_tool__compressed_is_stub(const struct perf_tool *tool) diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index e96b69d25a5b..2d9a4b1ca9d0 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -81,7 +81,9 @@ struct perf_tool { stat_round, feature, finished_init, - bpf_metadata; + bpf_metadata, + schedstat_cpu, + schedstat_domain; event_op4 compressed; event_op3 auxtrace; bool ordered_events; -- cgit v1.2.3 From 55657d7ac8caa98c7c0ef241bf64e176db899b4d Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:26 +0000 Subject: perf sched stats: Add schedstat v16 support The /proc/schedstat file output is standardized with version number. Add support to record and raw dump v16 version layout. Version 16 of schedstats changed the order of definitions within 'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns in show_schedstat(). In particular the position of CPU_IDLE and __CPU_NOT_IDLE changed places. Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Tested-by: James Clark Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Makefile | 2 +- tools/lib/perf/include/perf/event.h | 14 +++ tools/lib/perf/include/perf/schedstat-v16.h | 146 ++++++++++++++++++++++++++++ tools/perf/util/event.c | 6 ++ tools/perf/util/synthetic-events.c | 6 ++ 5 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 tools/lib/perf/include/perf/schedstat-v16.h (limited to 'tools') diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 9fa28e512ca8..965e066fd780 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -179,7 +179,7 @@ install_lib: libs cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ) HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h -HDRS += schedstat-v15.h +HDRS += schedstat-v15.h schedstat-v16.h INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index ce04fed7cefc..bd4d507ea8ab 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -502,6 +502,12 @@ struct perf_record_schedstat_cpu_v15 { #undef CPU_FIELD }; +struct perf_record_schedstat_cpu_v16 { +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) _type _name +#include "schedstat-v16.h" +#undef CPU_FIELD +}; + struct perf_record_schedstat_cpu { struct perf_event_header header; __u64 timestamp; @@ -511,6 +517,7 @@ struct perf_record_schedstat_cpu { char __pad[2]; union { struct perf_record_schedstat_cpu_v15 v15; + struct perf_record_schedstat_cpu_v16 v16; }; }; @@ -520,6 +527,12 @@ struct perf_record_schedstat_domain_v15 { #undef DOMAIN_FIELD }; +struct perf_record_schedstat_domain_v16 { +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) _type _name +#include "schedstat-v16.h" +#undef DOMAIN_FIELD +}; + #define DOMAIN_NAME_LEN 16 struct perf_record_schedstat_domain { @@ -530,6 +543,7 @@ struct perf_record_schedstat_domain { __u16 domain; union { struct perf_record_schedstat_domain_v15 v15; + struct perf_record_schedstat_domain_v16 v16; }; }; diff --git a/tools/lib/perf/include/perf/schedstat-v16.h b/tools/lib/perf/include/perf/schedstat-v16.h new file mode 100644 index 000000000000..3462b79c29af --- /dev/null +++ b/tools/lib/perf/include/perf/schedstat-v16.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CPU_FIELD +CPU_FIELD(__u32, yld_count, "sched_yield() count", + "%11u", false, yld_count, v16); +CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored", + "%11u", false, array_exp, v16); +CPU_FIELD(__u32, sched_count, "schedule() called", + "%11u", false, sched_count, v16); +CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle", + "%11u", true, sched_count, v16); +CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called", + "%11u", false, ttwu_count, v16); +CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu", + "%11u", true, ttwu_count, v16); +CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)", + "%11llu", false, rq_cpu_time, v16); +CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)", + "%11llu", true, rq_cpu_time, v16); +CPU_FIELD(__u64, pcount, "total timeslices run on this cpu", + "%11llu", false, pcount, v16); +#endif /* CPU_FIELD */ + +#ifdef DOMAIN_FIELD +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, busy_lb_count, + "load_balance() count on cpu busy", "%11u", true, v16); +DOMAIN_FIELD(__u32, busy_lb_balanced, + "load_balance() found balanced on cpu busy", "%11u", true, v16); +DOMAIN_FIELD(__u32, busy_lb_failed, + "load_balance() move task failed on cpu busy", "%11u", true, v16); +DOMAIN_FIELD(__u32, busy_lb_imbalance, + "imbalance sum on cpu busy", "%11u", false, v16); +DOMAIN_FIELD(__u32, busy_lb_gained, + "pull_task() count on cpu busy", "%11u", false, v16); +DOMAIN_FIELD(__u32, busy_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v16); +DOMAIN_FIELD(__u32, busy_lb_nobusyq, + "load_balance() failed to find busier queue on cpu busy", "%11u", true, v16); +DOMAIN_FIELD(__u32, busy_lb_nobusyg, + "load_balance() failed to find busier group on cpu busy", "%11u", true, v16); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u", + busy_lb_count, busy_lb_balanced, busy_lb_failed, v16); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(busy_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf", + busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v16); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, idle_lb_count, + "load_balance() count on cpu idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, idle_lb_balanced, + "load_balance() found balanced on cpu idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, idle_lb_failed, + "load_balance() move task failed on cpu idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, idle_lb_imbalance, + "imbalance sum on cpu idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, idle_lb_gained, + "pull_task() count on cpu idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, idle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, idle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, idle_lb_nobusyg, + "load_balance() failed to find busier group on cpu idle", "%11u", true, v16); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u", + idle_lb_count, idle_lb_balanced, idle_lb_failed, v16); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(idle_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf", + idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v16); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, newidle_lb_count, + "load_balance() count on cpu newly idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, newidle_lb_balanced, + "load_balance() found balanced on cpu newly idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, newidle_lb_failed, + "load_balance() move task failed on cpu newly idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, newidle_lb_imbalance, + "imbalance sum on cpu newly idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, newidle_lb_gained, + "pull_task() count on cpu newly idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, newidle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v16); +DOMAIN_FIELD(__u32, newidle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v16); +DOMAIN_FIELD(__u32, newidle_lb_nobusyg, + "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v16); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(newidle_lb_success_count, + "load_balance() success count on cpu newly idle", "%11u", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v16); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(newidle_lb_avg_count, + "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v16); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, alb_count, + "active_load_balance() count", "%11u", false, v16); +DOMAIN_FIELD(__u32, alb_failed, + "active_load_balance() move task failed", "%11u", false, v16); +DOMAIN_FIELD(__u32, alb_pushed, + "active_load_balance() successfully moved a task", "%11u", false, v16); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbe_count, + "sbe_count is not used", "%11u", false, v16); +DOMAIN_FIELD(__u32, sbe_balanced, + "sbe_balanced is not used", "%11u", false, v16); +DOMAIN_FIELD(__u32, sbe_pushed, + "sbe_pushed is not used", "%11u", false, v16); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbf_count, + "sbf_count is not used", "%11u", false, v16); +DOMAIN_FIELD(__u32, sbf_balanced, + "sbf_balanced is not used", "%11u", false, v16); +DOMAIN_FIELD(__u32, sbf_pushed, + "sbf_pushed is not used", "%11u", false, v16); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, ttwu_wake_remote, + "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v16); +DOMAIN_FIELD(__u32, ttwu_move_affine, + "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v16); +DOMAIN_FIELD(__u32, ttwu_move_balance, + "try_to_wake_up() started passive balancing", "%11u", false, v16); +#endif /* DOMAIN_FIELD */ diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 5a98c16e1092..730021cec161 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -585,6 +585,9 @@ size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp) if (version == 15) { #include return size; + } else if (version == 16) { +#include + return size; } #undef CPU_FIELD @@ -604,6 +607,9 @@ size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp) if (version == 15) { #include return size; + } else if (version == 16) { +#include + return size; } #undef DOMAIN_FIELD diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 5366ea921e70..4ce37357db05 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2567,6 +2567,8 @@ static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version if (version == 15) { #include + } else if (version == 16) { +#include } #undef CPU_FIELD @@ -2620,6 +2622,8 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers if (version == 15) { #include + } else if (version == 16) { +#include } #undef DOMAIN_FIELD @@ -2661,6 +2665,8 @@ int perf_event__synthesize_schedstat(const struct perf_tool *tool, if (!strcmp(line, "version 15\n")) { version = 15; + } else if (!strcmp(line, "version 16\n")) { + version = 16; } else { pr_err("Unsupported %s version: %s", path, line + 8); goto out_free_line; -- cgit v1.2.3 From 805da27252a290984782abfdb313a78e7c157369 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:27 +0000 Subject: perf sched stats: Add schedstat v17 support The /proc/schedstat file output is standardized with version number. Add support to record and raw dump v17 version layout. Version 17 of schedstats removed 'lb_imbalance' field as it has no significance anymore and instead added more relevant fields namely 'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and 'lb_imbalance_misfit'. The domain field prints the name of the corresponding sched domain from this version onwards. Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Makefile | 2 +- tools/lib/perf/include/perf/event.h | 14 +++ tools/lib/perf/include/perf/schedstat-v17.h | 164 ++++++++++++++++++++++++++++ tools/perf/util/event.c | 6 + tools/perf/util/synthetic-events.c | 11 ++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 tools/lib/perf/include/perf/schedstat-v17.h (limited to 'tools') diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 965e066fd780..27e6490f64dc 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -179,7 +179,7 @@ install_lib: libs cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ) HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h -HDRS += schedstat-v15.h schedstat-v16.h +HDRS += schedstat-v15.h schedstat-v16.h schedstat-v17.h INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index bd4d507ea8ab..9043dc72b5d6 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -508,6 +508,12 @@ struct perf_record_schedstat_cpu_v16 { #undef CPU_FIELD }; +struct perf_record_schedstat_cpu_v17 { +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) _type _name +#include "schedstat-v17.h" +#undef CPU_FIELD +}; + struct perf_record_schedstat_cpu { struct perf_event_header header; __u64 timestamp; @@ -518,6 +524,7 @@ struct perf_record_schedstat_cpu { union { struct perf_record_schedstat_cpu_v15 v15; struct perf_record_schedstat_cpu_v16 v16; + struct perf_record_schedstat_cpu_v17 v17; }; }; @@ -533,6 +540,12 @@ struct perf_record_schedstat_domain_v16 { #undef DOMAIN_FIELD }; +struct perf_record_schedstat_domain_v17 { +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) _type _name +#include "schedstat-v17.h" +#undef DOMAIN_FIELD +}; + #define DOMAIN_NAME_LEN 16 struct perf_record_schedstat_domain { @@ -544,6 +557,7 @@ struct perf_record_schedstat_domain { union { struct perf_record_schedstat_domain_v15 v15; struct perf_record_schedstat_domain_v16 v16; + struct perf_record_schedstat_domain_v17 v17; }; }; diff --git a/tools/lib/perf/include/perf/schedstat-v17.h b/tools/lib/perf/include/perf/schedstat-v17.h new file mode 100644 index 000000000000..865dc7c1039c --- /dev/null +++ b/tools/lib/perf/include/perf/schedstat-v17.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CPU_FIELD +CPU_FIELD(__u32, yld_count, "sched_yield() count", + "%11u", false, yld_count, v17); +CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored", + "%11u", false, array_exp, v17); +CPU_FIELD(__u32, sched_count, "schedule() called", + "%11u", false, sched_count, v17); +CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle", + "%11u", true, sched_count, v17); +CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called", + "%11u", false, ttwu_count, v17); +CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu", + "%11u", true, ttwu_count, v17); +CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)", + "%11llu", false, rq_cpu_time, v17); +CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)", + "%11llu", true, rq_cpu_time, v17); +CPU_FIELD(__u64, pcount, "total timeslices run on this cpu", + "%11llu", false, pcount, v17); +#endif /* CPU_FIELD */ + +#ifdef DOMAIN_FIELD +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, busy_lb_count, + "load_balance() count on cpu busy", "%11u", true, v17); +DOMAIN_FIELD(__u32, busy_lb_balanced, + "load_balance() found balanced on cpu busy", "%11u", true, v17); +DOMAIN_FIELD(__u32, busy_lb_failed, + "load_balance() move task failed on cpu busy", "%11u", true, v17); +DOMAIN_FIELD(__u32, busy_lb_imbalance_load, + "imbalance in load on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_imbalance_util, + "imbalance in utilization on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_imbalance_task, + "imbalance in number of tasks on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_imbalance_misfit, + "imbalance in misfit tasks on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_gained, + "pull_task() count on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v17); +DOMAIN_FIELD(__u32, busy_lb_nobusyq, + "load_balance() failed to find busier queue on cpu busy", "%11u", true, v17); +DOMAIN_FIELD(__u32, busy_lb_nobusyg, + "load_balance() failed to find busier group on cpu busy", "%11u", true, v17); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u", + busy_lb_count, busy_lb_balanced, busy_lb_failed, v17); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(busy_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf", + busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v17); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, idle_lb_count, + "load_balance() count on cpu idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, idle_lb_balanced, + "load_balance() found balanced on cpu idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, idle_lb_failed, + "load_balance() move task failed on cpu idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, idle_lb_imbalance_load, + "imbalance in load on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_imbalance_util, + "imbalance in utilization on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_imbalance_task, + "imbalance in number of tasks on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_imbalance_misfit, + "imbalance in misfit tasks on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_gained, + "pull_task() count on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, idle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, idle_lb_nobusyg, + "load_balance() failed to find busier group on cpu idle", "%11u", true, v17); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u", + idle_lb_count, idle_lb_balanced, idle_lb_failed, v17); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(idle_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf", + idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v17); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, newidle_lb_count, + "load_balance() count on cpu newly idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, newidle_lb_balanced, + "load_balance() found balanced on cpu newly idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, newidle_lb_failed, + "load_balance() move task failed on cpu newly idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, newidle_lb_imbalance_load, + "imbalance in load on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_imbalance_util, + "imbalance in utilization on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_imbalance_task, + "imbalance in number of tasks on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_imbalance_misfit, + "imbalance in misfit tasks on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_gained, + "pull_task() count on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_hot_gained, + "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v17); +DOMAIN_FIELD(__u32, newidle_lb_nobusyq, + "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v17); +DOMAIN_FIELD(__u32, newidle_lb_nobusyg, + "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v17); +#ifdef DERIVED_CNT_FIELD +DERIVED_CNT_FIELD(newidle_lb_success_count, + "load_balance() success count on cpu newly idle", "%11u", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v17); +#endif +#ifdef DERIVED_AVG_FIELD +DERIVED_AVG_FIELD(newidle_lb_avg_pulled, + "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf", + newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v17); +#endif +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, alb_count, + "active_load_balance() count", "%11u", false, v17); +DOMAIN_FIELD(__u32, alb_failed, + "active_load_balance() move task failed", "%11u", false, v17); +DOMAIN_FIELD(__u32, alb_pushed, + "active_load_balance() successfully moved a task", "%11u", false, v17); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbe_count, + "sbe_count is not used", "%11u", false, v17); +DOMAIN_FIELD(__u32, sbe_balanced, + "sbe_balanced is not used", "%11u", false, v17); +DOMAIN_FIELD(__u32, sbe_pushed, + "sbe_pushed is not used", "%11u", false, v17); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, sbf_count, + "sbf_count is not used", "%11u", false, v17); +DOMAIN_FIELD(__u32, sbf_balanced, + "sbf_balanced is not used", "%11u", false, v17); +DOMAIN_FIELD(__u32, sbf_pushed, + "sbf_pushed is not used", "%11u", false, v17); +#ifdef DOMAIN_CATEGORY +DOMAIN_CATEGORY(" "); +#endif +DOMAIN_FIELD(__u32, ttwu_wake_remote, + "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v17); +DOMAIN_FIELD(__u32, ttwu_move_affine, + "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v17); +DOMAIN_FIELD(__u32, ttwu_move_balance, + "try_to_wake_up() started passive balancing", "%11u", false, v17); +#endif /* DOMAIN_FIELD */ diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 730021cec161..2dde1044b5a7 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -588,6 +588,9 @@ size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp) } else if (version == 16) { #include return size; + } else if (version == 17) { +#include + return size; } #undef CPU_FIELD @@ -610,6 +613,9 @@ size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp) } else if (version == 16) { #include return size; + } else if (version == 17) { +#include + return size; } #undef DOMAIN_FIELD diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 4ce37357db05..ef79433ebc3a 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2569,6 +2569,8 @@ static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version #include } else if (version == 16) { #include + } else if (version == 17) { +#include } #undef CPU_FIELD @@ -2595,6 +2597,11 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers return NULL; ch = io__get_dec(io, &d_num); + if (version >= 17) { + /* Skip domain name as it can be extracted from perf header */ + while (io__get_char(io) != ' ') + continue; + } /* Skip cpumask as it can be extracted from perf header */ while (io__get_char(io) != ' ') @@ -2624,6 +2631,8 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers #include } else if (version == 16) { #include + } else if (version == 17) { +#include } #undef DOMAIN_FIELD @@ -2667,6 +2676,8 @@ int perf_event__synthesize_schedstat(const struct perf_tool *tool, version = 15; } else if (!strcmp(line, "version 16\n")) { version = 16; + } else if (!strcmp(line, "version 17\n")) { + version = 17; } else { pr_err("Unsupported %s version: %s", path, line + 8); goto out_free_line; -- cgit v1.2.3 From 5a357ae6ad63fd101a4f20d081f8893b51cc0790 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:28 +0000 Subject: perf sched stats: Add support for report subcommand `perf sched stats record` captures two sets of samples. For workload profile, first set right before workload starts and second set after workload finishes. For the systemwide profile, first set at the beginning of profile and second set on receiving SIGINT signal. Add `perf sched stats report` subcommand that will read both the set of samples, get the diff and render a final report. Final report prints scheduler stat at cpu granularity as well as sched domain granularity. Example usage: # ./perf sched stats record -- true [ perf sched stats: Wrote samples to perf.data ] # perf sched stats report Description ---------------------------------------------------------------------------------------------------- DESC -> Description of the field COUNT -> Value of the field PCT_CHANGE -> Percent change with corresponding base value AVG_JIFFIES -> Avg time in jiffies between two consecutive occurrence of event ---------------------------------------------------------------------------------------------------- Time elapsed (in jiffies) : 1 ---------------------------------------------------------------------------------------------------- CPU: ---------------------------------------------------------------------------------------------------- DESC COUNT PCT_CHANGE ---------------------------------------------------------------------------------------------------- yld_count : 0 array_exp : 0 sched_count : 0 sched_goidle : 0 ( 0.00% ) ttwu_count : 0 ttwu_local : 0 ( 0.00% ) rq_cpu_time : 33525 run_delay : 436 ( 1.30% ) pcount : 0 ---------------------------------------------------------------------------------------------------- CPU: | DOMAIN: SMT ---------------------------------------------------------------------------------------------------- DESC COUNT AVG_JIFFIES ----------------------------------------- ------------------------------------------ busy_lb_count : 0 $ 0.00 $ busy_lb_balanced : 0 $ 0.00 $ busy_lb_failed : 0 $ 0.00 $ busy_lb_imbalance_load : 0 busy_lb_imbalance_util : 0 busy_lb_imbalance_task : 0 busy_lb_imbalance_misfit : 0 busy_lb_gained : 0 busy_lb_hot_gained : 0 busy_lb_nobusyq : 0 $ 0.00 $ busy_lb_nobusyg : 0 $ 0.00 $ *busy_lb_success_count : 0 *busy_lb_avg_pulled : 0.00 ... and so on. Output shows similar data for all the cpus in the system. Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Tested-by: James Clark Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 509 ++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/util.c | 6 + tools/perf/util/util.h | 2 + 3 files changed, 515 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ee3b4e42156e..c6b054b9b12a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3929,6 +3929,503 @@ out: return err; } +struct schedstat_domain { + struct list_head domain_list; + struct perf_record_schedstat_domain *domain_data; +}; + +struct schedstat_cpu { + struct list_head cpu_list; + struct list_head domain_head; + struct perf_record_schedstat_cpu *cpu_data; +}; + +static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head); +static struct schedstat_cpu *cpu_second_pass; +static struct schedstat_domain *domain_second_pass; +static bool after_workload_flag; +static bool verbose_field; + +static void store_schedtstat_cpu_diff(struct schedstat_cpu *after_workload) +{ + struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data; + struct perf_record_schedstat_cpu *after = after_workload->cpu_data; + __u16 version = after_workload->cpu_data->version; + +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + (before->_ver._name = after->_ver._name - before->_ver._name) + + if (version == 15) { +#include + } else if (version == 16) { +#include + } else if (version == 17) { +#include + } + +#undef CPU_FIELD +} + +static void store_schedstat_domain_diff(struct schedstat_domain *after_workload) +{ + struct perf_record_schedstat_domain *before = domain_second_pass->domain_data; + struct perf_record_schedstat_domain *after = after_workload->domain_data; + __u16 version = after_workload->domain_data->version; + +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ + (before->_ver._name = after->_ver._name - before->_ver._name) + + if (version == 15) { +#include + } else if (version == 16) { +#include + } else if (version == 17) { +#include + } +#undef DOMAIN_FIELD +} + +static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs) +{ + printf("%-65s %12s %12s\n", "DESC", "COUNT", "PCT_CHANGE"); + printf("%.*s\n", 100, graph_dotted_line); + +#define CALC_PCT(_x, _y) ((_y) ? ((double)(_x) / (_y)) * 100 : 0.0) + +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + do { \ + printf("%-65s: " _format, verbose_field ? _desc : #_name, \ + cs->_ver._name); \ + if (_is_pct) { \ + printf(" ( %8.2lf%% )", \ + CALC_PCT(cs->_ver._name, cs->_ver._pct_of)); \ + } \ + printf("\n"); \ + } while (0) + + if (cs->version == 15) { +#include + } else if (cs->version == 16) { +#include + } else if (cs->version == 17) { +#include + } + +#undef CPU_FIELD +#undef CALC_PCT +} + +static inline void print_domain_stats(struct perf_record_schedstat_domain *ds, + __u64 jiffies) +{ + printf("%-65s %12s %14s\n", "DESC", "COUNT", "AVG_JIFFIES"); + +#define DOMAIN_CATEGORY(_desc) \ + do { \ + size_t _len = strlen(_desc); \ + size_t _pre_dash_cnt = (100 - _len) / 2; \ + size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt; \ + print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\ + } while (0) + +#define CALC_AVG(_x, _y) ((_y) ? (long double)(_x) / (_y) : 0.0) + +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ + do { \ + printf("%-65s: " _format, verbose_field ? _desc : #_name, \ + ds->_ver._name); \ + if (_is_jiffies) { \ + printf(" $ %11.2Lf $", \ + CALC_AVG(jiffies, ds->_ver._name)); \ + } \ + printf("\n"); \ + } while (0) + +#define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver) \ + printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name, \ + (ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z)) + +#define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver) \ + printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name, \ + CALC_AVG(ds->_ver._w, \ + ((ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z)))) + + if (ds->version == 15) { +#include + } else if (ds->version == 16) { +#include + } else if (ds->version == 17) { +#include + } + +#undef DERIVED_AVG_FIELD +#undef DERIVED_CNT_FIELD +#undef DOMAIN_FIELD +#undef CALC_AVG +#undef DOMAIN_CATEGORY +} + +static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu, + struct schedstat_cpu *cptr, + int cnt, bool is_last) +{ + struct perf_record_schedstat_cpu *summary_cs = summary_cpu->cpu_data, + *temp_cs = cptr->cpu_data; + +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + do { \ + summary_cs->_ver._name += temp_cs->_ver._name; \ + if (is_last) \ + summary_cs->_ver._name /= cnt; \ + } while (0) + + if (cptr->cpu_data->version == 15) { +#include + } else if (cptr->cpu_data->version == 16) { +#include + } else if (cptr->cpu_data->version == 17) { +#include + } +#undef CPU_FIELD +} + +static void summarize_schedstat_domain(struct schedstat_domain *summary_domain, + struct schedstat_domain *dptr, + int cnt, bool is_last) +{ + struct perf_record_schedstat_domain *summary_ds = summary_domain->domain_data, + *temp_ds = dptr->domain_data; + +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ + do { \ + summary_ds->_ver._name += temp_ds->_ver._name; \ + if (is_last) \ + summary_ds->_ver._name /= cnt; \ + } while (0) + + if (dptr->domain_data->version == 15) { +#include + } else if (dptr->domain_data->version == 16) { +#include + } else if (dptr->domain_data->version == 17) { +#include + } +#undef DOMAIN_FIELD +} + +/* + * get_all_cpu_stats() appends the summary to the head of the list. + */ +static int get_all_cpu_stats(struct list_head *head) +{ + struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list); + struct schedstat_cpu *summary_head = NULL; + struct perf_record_schedstat_domain *ds; + struct perf_record_schedstat_cpu *cs; + struct schedstat_domain *dptr, *tdptr; + bool is_last = false; + int cnt = 1; + int ret = 0; + + if (cptr) { + summary_head = zalloc(sizeof(*summary_head)); + if (!summary_head) + return -ENOMEM; + + summary_head->cpu_data = zalloc(sizeof(*cs)); + memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*cs)); + + INIT_LIST_HEAD(&summary_head->domain_head); + + list_for_each_entry(dptr, &cptr->domain_head, domain_list) { + tdptr = zalloc(sizeof(*tdptr)); + if (!tdptr) + return -ENOMEM; + + tdptr->domain_data = zalloc(sizeof(*ds)); + if (!tdptr->domain_data) + return -ENOMEM; + + memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*ds)); + list_add_tail(&tdptr->domain_list, &summary_head->domain_head); + } + } + + list_for_each_entry(cptr, head, cpu_list) { + if (list_is_first(&cptr->cpu_list, head)) + continue; + + if (list_is_last(&cptr->cpu_list, head)) + is_last = true; + + cnt++; + summarize_schedstat_cpu(summary_head, cptr, cnt, is_last); + tdptr = list_first_entry(&summary_head->domain_head, struct schedstat_domain, + domain_list); + + list_for_each_entry(dptr, &cptr->domain_head, domain_list) { + summarize_schedstat_domain(tdptr, dptr, cnt, is_last); + tdptr = list_next_entry(tdptr, domain_list); + } + } + + list_add(&summary_head->cpu_list, head); + return ret; +} + +static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **cd_map) +{ + struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list); + __u64 jiffies = cptr->cpu_data->timestamp; + struct perf_record_schedstat_domain *ds; + struct perf_record_schedstat_cpu *cs; + struct schedstat_domain *dptr; + bool is_summary = true; + int ret = 0; + + printf("Description\n"); + print_separator2(100, "", 0); + printf("%-30s-> %s\n", "DESC", "Description of the field"); + printf("%-30s-> %s\n", "COUNT", "Value of the field"); + printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value"); + printf("%-30s-> %s\n", "AVG_JIFFIES", + "Avg time in jiffies between two consecutive occurrence of event"); + + print_separator2(100, "", 0); + printf("\n"); + + printf("%-65s: %11llu\n", "Time elapsed (in jiffies)", jiffies); + + ret = get_all_cpu_stats(head); + + list_for_each_entry(cptr, head, cpu_list) { + cs = cptr->cpu_data; + print_separator2(100, "", 0); + + if (is_summary) + printf("CPU: \n"); + else + printf("CPU: %d\n", cs->cpu); + + print_separator2(100, "", 0); + print_cpu_stats(cs); + print_separator2(100, "", 0); + + list_for_each_entry(dptr, &cptr->domain_head, domain_list) { + struct domain_info *dinfo; + + ds = dptr->domain_data; + dinfo = cd_map[ds->cpu]->domains[ds->domain]; + if (is_summary) { + if (dinfo->dname) + printf("CPU: | DOMAIN: %s\n", + dinfo->dname); + else + printf("CPU: | DOMAIN: %d\n", + dinfo->domain); + } else { + if (dinfo->dname) + printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ", + cs->cpu, dinfo->dname); + else + printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ", + cs->cpu, dinfo->domain); + + printf("%s\n", dinfo->cpulist); + } + print_separator2(100, "", 0); + print_domain_stats(ds, jiffies); + print_separator2(100, "", 0); + } + is_summary = false; + } + return ret; +} + +/* + * Creates a linked list of cpu_data and domain_data. Below represents the structure of the linked + * list where CPU0,CPU1,CPU2, ..., CPU(N-1) stores the cpu_data. Here N is the total number of cpus. + * Each of the CPU points to the list of domain_data. Here DOMAIN0, DOMAIN1, DOMAIN2, ... represents + * the domain_data. Here D0, D1, D2, ..., Dm are the number of domains in the respective cpus. + * + * +----------+ + * | CPU_HEAD | + * +----------+ + * | + * v + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | CPU0 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D0-1) | + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | + * v + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | CPU1 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D1-1) | + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | + * v + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | CPU2 | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D2-1) | + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | + * v + * ... + * | + * v + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * | CPU(N-1) | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(Dm-1) | + * +----------+ +---------+ +---------+ +---------+ +--------------+ + * + * Each cpu as well as domain has 2 enties in the event list one before the workload starts and + * other after completion of the workload. The above linked list stores the diff of the cpu and + * domain statistics. + */ +static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_unused, + struct perf_session *session __maybe_unused, + union perf_event *event) +{ + struct perf_cpu this_cpu; + static __u32 initial_cpu; + + switch (event->header.type) { + case PERF_RECORD_SCHEDSTAT_CPU: + this_cpu.cpu = event->schedstat_cpu.cpu; + break; + case PERF_RECORD_SCHEDSTAT_DOMAIN: + this_cpu.cpu = event->schedstat_domain.cpu; + break; + default: + return 0; + } + + if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu)) + return 0; + + if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) { + struct schedstat_cpu *temp = zalloc(sizeof(*temp)); + + if (!temp) + return -ENOMEM; + + temp->cpu_data = zalloc(sizeof(*temp->cpu_data)); + if (!temp->cpu_data) + return -ENOMEM; + + memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp->cpu_data)); + + if (!list_empty(&cpu_head) && temp->cpu_data->cpu == initial_cpu) + after_workload_flag = true; + + if (!after_workload_flag) { + if (list_empty(&cpu_head)) + initial_cpu = temp->cpu_data->cpu; + + list_add_tail(&temp->cpu_list, &cpu_head); + INIT_LIST_HEAD(&temp->domain_head); + } else { + if (temp->cpu_data->cpu == initial_cpu) { + cpu_second_pass = list_first_entry(&cpu_head, struct schedstat_cpu, + cpu_list); + cpu_second_pass->cpu_data->timestamp = + temp->cpu_data->timestamp - cpu_second_pass->cpu_data->timestamp; + } else { + cpu_second_pass = list_next_entry(cpu_second_pass, cpu_list); + } + domain_second_pass = list_first_entry(&cpu_second_pass->domain_head, + struct schedstat_domain, domain_list); + store_schedtstat_cpu_diff(temp); + } + } else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) { + struct schedstat_cpu *cpu_tail; + struct schedstat_domain *temp = zalloc(sizeof(*temp)); + + if (!temp) + return -ENOMEM; + + temp->domain_data = zalloc(sizeof(*temp->domain_data)); + if (!temp->domain_data) + return -ENOMEM; + + memcpy(temp->domain_data, &event->schedstat_domain, sizeof(*temp->domain_data)); + + if (!after_workload_flag) { + cpu_tail = list_last_entry(&cpu_head, struct schedstat_cpu, cpu_list); + list_add_tail(&temp->domain_list, &cpu_tail->domain_head); + } else { + store_schedstat_domain_diff(temp); + domain_second_pass = list_next_entry(domain_second_pass, domain_list); + } + } + + return 0; +} + +static void free_schedstat(struct list_head *head) +{ + struct schedstat_domain *dptr, *n1; + struct schedstat_cpu *cptr, *n2; + + list_for_each_entry_safe(cptr, n2, head, cpu_list) { + list_for_each_entry_safe(dptr, n1, &cptr->domain_head, domain_list) { + list_del_init(&dptr->domain_list); + free(dptr); + } + list_del_init(&cptr->cpu_list); + free(cptr); + } +} + +static int perf_sched__schedstat_report(struct perf_sched *sched) +{ + struct cpu_domain_map **cd_map; + struct perf_session *session; + struct target target = {}; + struct perf_data data = { + .path = input_name, + .mode = PERF_DATA_MODE_READ, + }; + int err = 0; + + sched->tool.schedstat_cpu = perf_sched__process_schedstat; + sched->tool.schedstat_domain = perf_sched__process_schedstat; + + session = perf_session__new(&data, &sched->tool); + if (IS_ERR(session)) { + pr_err("Perf session creation failed.\n"); + return PTR_ERR(session); + } + + if (cpu_list) + target.cpu_list = cpu_list; + else + target.system_wide = true; + + err = evlist__create_maps(session->evlist, &target); + if (err < 0) + goto out; + + user_requested_cpus = session->evlist->core.user_requested_cpus; + + err = perf_session__process_events(session); + + if (!err) { + setup_pager(); + + if (list_empty(&cpu_head)) { + pr_err("Data is not available\n"); + err = -1; + goto out; + } + + cd_map = session->header.env.cpu_domain; + err = show_schedstat_data(&cpu_head, cd_map); + } + +out: + free_schedstat(&cpu_head); + perf_session__delete(session); + return err; +} + static bool schedstat_events_exposed(void) { /* @@ -4106,9 +4603,12 @@ int cmd_sched(int argc, const char **argv) OPT_PARENT(sched_options) }; const struct option stats_options[] = { + OPT_STRING('i', "input", &input_name, "file", + "`stats report` with input filename"), OPT_STRING('o', "output", &output_name, "file", "`stats record` with output filename"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_BOOLEAN('v', "verbose", &verbose_field, "Show explanation for fields in the report"), OPT_END() }; @@ -4129,7 +4629,7 @@ int cmd_sched(int argc, const char **argv) NULL }; const char *stats_usage[] = { - "perf sched stats {record} []", + "perf sched stats {record|report} []", NULL }; const char *const sched_subcommands[] = { "record", "latency", "map", @@ -4233,7 +4733,7 @@ int cmd_sched(int argc, const char **argv) if (!ret) ret = perf_sched__timehist(&sched); } else if (!strcmp(argv[0], "stats")) { - const char *const stats_subcommands[] = {"record", NULL}; + const char *const stats_subcommands[] = {"record", "report", NULL}; argc = parse_options_subcommand(argc, argv, stats_options, stats_subcommands, @@ -4245,6 +4745,11 @@ int cmd_sched(int argc, const char **argv) argc = parse_options(argc, argv, stats_options, stats_usage, 0); return perf_sched__schedstat_record(&sched, argc, argv); + } else if (argv[0] && !strcmp(argv[0], "report")) { + if (argc) + argc = parse_options(argc, argv, stats_options, + stats_usage, 0); + return perf_sched__schedstat_report(&sched); } usage_with_options(stats_usage, stats_options); } else { diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index b87ff96a9f45..03a603fbcd7d 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -299,6 +299,12 @@ free_bm: free(bm); } +void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt) +{ + printf("%.*s%s%.*s\n", pre_dash_cnt, graph_dotted_line, s, post_dash_cnt, + graph_dotted_line); +} + int rm_rf_perf_data(const char *path) { const char *pat[] = { diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 1572c8cf04e5..394dbfa944ac 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -51,6 +51,8 @@ int perf_tip(char **strp, const char *dirpath); void cpumask_to_cpulist(char *cpumask, char *cpulist); +void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt); + #ifndef HAVE_SCHED_GETCPU_SUPPORT int sched_getcpu(void); #endif -- cgit v1.2.3 From 00093b3133984ffe80697b5d2e7f204983660dd9 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:29 +0000 Subject: perf sched stats: Add support for live mode The live mode works similar to simple `perf stat` command, by profiling the target and printing results on the terminal as soon as the target finishes. Example usage: # perf sched stats -- true Description ---------------------------------------------------------------------------------------------------- DESC -> Description of the field COUNT -> Value of the field PCT_CHANGE -> Percent change with corresponding base value AVG_JIFFIES -> Avg time in jiffies between two consecutive occurrence of event ---------------------------------------------------------------------------------------------------- Time elapsed (in jiffies) : 1 ---------------------------------------------------------------------------------------------------- CPU: ---------------------------------------------------------------------------------------------------- DESC COUNT PCT_CHANGE ---------------------------------------------------------------------------------------------------- yld_count : 0 array_exp : 0 sched_count : 0 sched_goidle : 0 ( 0.00% ) ttwu_count : 0 ttwu_local : 0 ( 0.00% ) rq_cpu_time : 27875 run_delay : 0 ( 0.00% ) pcount : 0 ---------------------------------------------------------------------------------------------------- CPU: | DOMAIN: SMT ---------------------------------------------------------------------------------------------------- DESC COUNT AVG_JIFFIES ----------------------------------------- ------------------------------------------ busy_lb_count : 0 $ 0.00 $ busy_lb_balanced : 0 $ 0.00 $ busy_lb_failed : 0 $ 0.00 $ busy_lb_imbalance_load : 0 busy_lb_imbalance_util : 0 busy_lb_imbalance_task : 0 busy_lb_imbalance_misfit : 0 busy_lb_gained : 0 busy_lb_hot_gained : 0 busy_lb_nobusyq : 0 $ 0.00 $ busy_lb_nobusyg : 0 $ 0.00 $ *busy_lb_success_count : 0 *busy_lb_avg_pulled : 0.00 ... and so on. Output will show similar data for all the cpus in the system. Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Tested-by: James Clark Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han [ Avoid potentially using 'sv' uninitialized by calling free_cpu_domain_info() only when build_cpu_domain_map() is called ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 99 +++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/header.c | 3 +- tools/perf/util/header.h | 3 ++ 3 files changed, 102 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c6b054b9b12a..ec9fa29196b2 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -4426,6 +4426,103 @@ out: return err; } +static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + return perf_sched__process_schedstat(tool, NULL, event); +} + +static int perf_sched__schedstat_live(struct perf_sched *sched, + int argc, const char **argv) +{ + struct cpu_domain_map **cd_map = NULL; + struct target target = {}; + u32 __maybe_unused md; + struct evlist *evlist; + u32 nr = 0, sv; + int reset = 0; + int err = 0; + + signal(SIGINT, sighandler); + signal(SIGCHLD, sighandler); + signal(SIGTERM, sighandler); + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + /* + * `perf sched schedstat` does not support workload profiling (-p pid) + * since /proc/schedstat file contains cpu specific data only. Hence, a + * profile target is either set of cpus or systemwide, never a process. + * Note that, although `-- ` is supported, profile data are + * still cpu/systemwide. + */ + if (cpu_list) + target.cpu_list = cpu_list; + else + target.system_wide = true; + + if (argc) { + err = evlist__prepare_workload(evlist, &target, argv, false, NULL); + if (err) + goto out; + } + + err = evlist__create_maps(evlist, &target); + if (err < 0) + goto out; + + user_requested_cpus = evlist->core.user_requested_cpus; + + err = perf_event__synthesize_schedstat(&(sched->tool), + process_synthesized_event_live, + user_requested_cpus); + if (err < 0) + goto out; + + err = enable_sched_schedstats(&reset); + if (err < 0) + goto out; + + if (argc) + evlist__start_workload(evlist); + + /* wait for signal */ + pause(); + + if (reset) { + err = disable_sched_schedstat(); + if (err < 0) + goto out; + } + + err = perf_event__synthesize_schedstat(&(sched->tool), + process_synthesized_event_live, + user_requested_cpus); + if (err) + goto out; + + setup_pager(); + + if (list_empty(&cpu_head)) { + pr_err("Data is not available\n"); + err = -1; + goto out; + } + + nr = cpu__max_present_cpu().cpu; + cd_map = build_cpu_domain_map(&sv, &md, nr); + show_schedstat_data(&cpu_head, cd_map); + free_cpu_domain_info(cd_map, sv, nr); +out: + free_schedstat(&cpu_head); + evlist__delete(evlist); + return err; +} + static bool schedstat_events_exposed(void) { /* @@ -4751,7 +4848,7 @@ int cmd_sched(int argc, const char **argv) stats_usage, 0); return perf_sched__schedstat_report(&sched); } - usage_with_options(stats_usage, stats_options); + return perf_sched__schedstat_live(&sched, argc, argv); } else { usage_with_options(sched_usage, sched_options); } diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 673d53bb2a2c..9a15dd4b7640 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1614,8 +1614,7 @@ static int write_pmu_caps(struct feat_fd *ff, return 0; } -static struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, - u32 nr) +struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, u32 nr) { struct domain_info *domain_info; struct cpu_domain_map **cd_map; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index c62f3275a80f..36cc74e2d14d 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -211,4 +211,7 @@ char *get_cpuid_str(struct perf_cpu cpu); char *get_cpuid_allow_env_override(struct perf_cpu cpu); int strcmp_cpuid_str(const char *s1, const char *s2); + +struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, + u32 nr); #endif /* __PERF_HEADER_H */ -- cgit v1.2.3 From 064790a3d4a89536d00a61d7a02de67ad319bdc5 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:30 +0000 Subject: perf sched stats: Add support for diff subcommand `perf sched stats diff` subcommand will take two perf.data files as an input and it will print the diff between the two perf.data files. The default input to this subcommnd is perf.data.old and perf.data. Example usage: # perf sched stats diff sample1.data sample2.data Description ---------------------------------------------------------------------------------------------------- DESC -> Description of the field COUNT -> Value of the field PCT_CHANGE -> Percent change with corresponding base value AVG_JIFFIES -> Avg time in jiffies between two consecutive occurrence of event ---------------------------------------------------------------------------------------------------- Time elapsed (in jiffies) : 1, 1 ---------------------------------------------------------------------------------------------------- CPU: ---------------------------------------------------------------------------------------------------- DESC COUNT1 COUNT2 PCT_CHANGE PCT_CHANGE1 PCT_CHANGE2 ---------------------------------------------------------------------------------------------------- yld_count : 0, 0 | 0.00% | array_exp : 0, 0 | 0.00% | sched_count : 0, 0 | 0.00% | sched_goidle : 0, 0 | 0.00% | ( 0.00%, 0.00% ) ttwu_count : 0, 0 | 0.00% | ttwu_local : 0, 0 | 0.00% | ( 0.00%, 0.00% ) rq_cpu_time : 32565, 33525 | 2.95% | run_delay : 0, 436 | 0.00% | ( 0.00%, 1.30% ) pcount : 0, 0 | 0.00% | ---------------------------------------------------------------------------------------------------- CPU: | DOMAIN: SMT ---------------------------------------------------------------------------------------------------- DESC COUNT1 COUNT2 PCT_CHANGE AVG_JIFFIES1 AVG_JIFFIES2 ----------------------------------------- ------------------------------------------ busy_lb_count : 0, 0 | 0.00% | $ 0.00, 0.00 $ busy_lb_balanced : 0, 0 | 0.00% | $ 0.00, 0.00 $ busy_lb_failed : 0, 0 | 0.00% | $ 0.00, 0.00 $ busy_lb_imbalance_load : 0, 0 | 0.00% | busy_lb_imbalance_util : 0, 0 | 0.00% | busy_lb_imbalance_task : 0, 0 | 0.00% | busy_lb_imbalance_misfit : 0, 0 | 0.00% | busy_lb_gained : 0, 0 | 0.00% | busy_lb_hot_gained : 0, 0 | 0.00% | busy_lb_nobusyq : 0, 0 | 0.00% | $ 0.00, 0.00 $ busy_lb_nobusyg : 0, 0 | 0.00% | $ 0.00, 0.00 $ *busy_lb_success_count : 0, 0 | 0.00% | *busy_lb_avg_pulled : 0.00, 0.00 | 0.00% | ... and so on. Output contains the diff of aggregated data of all the busy, idle and newidle categories for all the sched domains in the system. Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 316 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 260 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ec9fa29196b2..b190e928117c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3985,29 +3985,46 @@ static void store_schedstat_domain_diff(struct schedstat_domain *after_workload) #undef DOMAIN_FIELD } -static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs) +#define PCT_CHNG(_x, _y) ((_x) ? ((double)((double)(_y) - (_x)) / (_x)) * 100 : 0.0) +static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1, + struct perf_record_schedstat_cpu *cs2) { - printf("%-65s %12s %12s\n", "DESC", "COUNT", "PCT_CHANGE"); - printf("%.*s\n", 100, graph_dotted_line); + printf("%-65s ", "DESC"); + if (!cs2) + printf("%12s %12s", "COUNT", "PCT_CHANGE"); + else + printf("%12s %11s %12s %14s %10s", "COUNT1", "COUNT2", "PCT_CHANGE", + "PCT_CHANGE1", "PCT_CHANGE2"); + + printf("\n"); + print_separator2(100, "", 0); #define CALC_PCT(_x, _y) ((_y) ? ((double)(_x) / (_y)) * 100 : 0.0) -#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ - do { \ - printf("%-65s: " _format, verbose_field ? _desc : #_name, \ - cs->_ver._name); \ - if (_is_pct) { \ - printf(" ( %8.2lf%% )", \ - CALC_PCT(cs->_ver._name, cs->_ver._pct_of)); \ - } \ - printf("\n"); \ +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver) \ + do { \ + printf("%-65s: " _format, verbose_field ? _desc : #_name, \ + cs1->_ver._name); \ + if (!cs2) { \ + if (_is_pct) \ + printf(" ( %8.2lf%% )", \ + CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of)); \ + } else { \ + printf("," _format " | %8.2lf%% |", cs2->_ver._name, \ + PCT_CHNG(cs1->_ver._name, cs2->_ver._name)); \ + if (_is_pct) \ + printf(" ( %8.2lf%%, %8.2lf%% )", \ + CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of), \ + CALC_PCT(cs2->_ver._name, cs2->_ver._pct_of)); \ + } \ + printf("\n"); \ } while (0) - if (cs->version == 15) { + if (cs1->version == 15) { #include - } else if (cs->version == 16) { + } else if (cs1->version == 16) { #include - } else if (cs->version == 17) { + } else if (cs1->version == 17) { #include } @@ -4015,10 +4032,17 @@ static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs) #undef CALC_PCT } -static inline void print_domain_stats(struct perf_record_schedstat_domain *ds, - __u64 jiffies) +static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1, + struct perf_record_schedstat_domain *ds2, + __u64 jiffies1, __u64 jiffies2) { - printf("%-65s %12s %14s\n", "DESC", "COUNT", "AVG_JIFFIES"); + printf("%-65s ", "DESC"); + if (!ds2) + printf("%12s %14s", "COUNT", "AVG_JIFFIES"); + else + printf("%12s %11s %12s %16s %12s", "COUNT1", "COUNT2", "PCT_CHANGE", + "AVG_JIFFIES1", "AVG_JIFFIES2"); + printf("\n"); #define DOMAIN_CATEGORY(_desc) \ do { \ @@ -4033,28 +4057,54 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds, #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver) \ do { \ printf("%-65s: " _format, verbose_field ? _desc : #_name, \ - ds->_ver._name); \ - if (_is_jiffies) { \ - printf(" $ %11.2Lf $", \ - CALC_AVG(jiffies, ds->_ver._name)); \ + ds1->_ver._name); \ + if (!ds2) { \ + if (_is_jiffies) \ + printf(" $ %11.2Lf $", \ + CALC_AVG(jiffies1, ds1->_ver._name)); \ + } else { \ + printf("," _format " | %8.2lf%% |", ds2->_ver._name, \ + PCT_CHNG(ds1->_ver._name, ds2->_ver._name)); \ + if (_is_jiffies) \ + printf(" $ %11.2Lf, %11.2Lf $", \ + CALC_AVG(jiffies1, ds1->_ver._name), \ + CALC_AVG(jiffies2, ds2->_ver._name)); \ } \ printf("\n"); \ } while (0) #define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver) \ - printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name, \ - (ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z)) + do { \ + __u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z; \ + printf("*%-64s: " _format, verbose_field ? _desc : #_name, t1); \ + if (ds2) { \ + __u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z; \ + printf("," _format " | %8.2lf%% |", t2, \ + PCT_CHNG(t1, t2)); \ + } \ + printf("\n"); \ + } while (0) #define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver) \ - printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name, \ - CALC_AVG(ds->_ver._w, \ - ((ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z)))) + do { \ + __u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z; \ + printf("*%-64s: " _format, verbose_field ? _desc : #_name, \ + CALC_AVG(ds1->_ver._w, t1)); \ + if (ds2) { \ + __u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z; \ + printf("," _format " | %8.2Lf%% |", \ + CALC_AVG(ds2->_ver._w, t2), \ + PCT_CHNG(CALC_AVG(ds1->_ver._w, t1), \ + CALC_AVG(ds2->_ver._w, t2))); \ + } \ + printf("\n"); \ + } while (0) - if (ds->version == 15) { + if (ds1->version == 15) { #include - } else if (ds->version == 16) { + } else if (ds1->version == 16) { #include - } else if (ds->version == 17) { + } else if (ds1->version == 17) { #include } @@ -4064,6 +4114,7 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds, #undef CALC_AVG #undef DOMAIN_CATEGORY } +#undef PCT_CHNG static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu, struct schedstat_cpu *cptr, @@ -4173,13 +4224,16 @@ static int get_all_cpu_stats(struct list_head *head) return ret; } -static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **cd_map) +static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **cd_map1, + struct list_head *head2, struct cpu_domain_map **cd_map2, + bool summary_only) { - struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list); - __u64 jiffies = cptr->cpu_data->timestamp; - struct perf_record_schedstat_domain *ds; - struct perf_record_schedstat_cpu *cs; - struct schedstat_domain *dptr; + struct schedstat_cpu *cptr1 = list_first_entry(head1, struct schedstat_cpu, cpu_list); + struct perf_record_schedstat_domain *ds1 = NULL, *ds2 = NULL; + struct perf_record_schedstat_cpu *cs1 = NULL, *cs2 = NULL; + struct schedstat_domain *dptr1 = NULL, *dptr2 = NULL; + struct schedstat_cpu *cptr2 = NULL; + __u64 jiffies1 = 0, jiffies2 = 0; bool is_summary = true; int ret = 0; @@ -4194,49 +4248,100 @@ static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **c print_separator2(100, "", 0); printf("\n"); - printf("%-65s: %11llu\n", "Time elapsed (in jiffies)", jiffies); + printf("%-65s: ", "Time elapsed (in jiffies)"); + jiffies1 = cptr1->cpu_data->timestamp; + printf("%11llu", jiffies1); + if (head2) { + cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list); + jiffies2 = cptr2->cpu_data->timestamp; + printf(",%11llu", jiffies2); + } + printf("\n"); + + ret = get_all_cpu_stats(head1); + if (cptr2) { + ret = get_all_cpu_stats(head2); + cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list); + } - ret = get_all_cpu_stats(head); + list_for_each_entry(cptr1, head1, cpu_list) { + struct cpu_domain_map *cd_info1 = NULL, *cd_info2 = NULL; + + cs1 = cptr1->cpu_data; + cd_info1 = cd_map1[cs1->cpu]; + if (cptr2) { + cs2 = cptr2->cpu_data; + cd_info2 = cd_map2[cs2->cpu]; + dptr2 = list_first_entry(&cptr2->domain_head, struct schedstat_domain, + domain_list); + } + + if (cs2 && cs1->cpu != cs2->cpu) { + pr_err("Failed because matching cpus not found for diff\n"); + return -1; + } + + if (cd_info2 && cd_info1->nr_domains != cd_info2->nr_domains) { + pr_err("Failed because nr_domains is not same for cpus\n"); + return -1; + } - list_for_each_entry(cptr, head, cpu_list) { - cs = cptr->cpu_data; print_separator2(100, "", 0); if (is_summary) printf("CPU: \n"); else - printf("CPU: %d\n", cs->cpu); + printf("CPU: %d\n", cs1->cpu); print_separator2(100, "", 0); - print_cpu_stats(cs); + print_cpu_stats(cs1, cs2); print_separator2(100, "", 0); - list_for_each_entry(dptr, &cptr->domain_head, domain_list) { - struct domain_info *dinfo; + list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) { + struct domain_info *dinfo1 = NULL, *dinfo2 = NULL; + + ds1 = dptr1->domain_data; + dinfo1 = cd_info1->domains[ds1->domain]; + if (dptr2) { + ds2 = dptr2->domain_data; + dinfo2 = cd_info2->domains[ds2->domain]; + } + + if (dinfo2 && dinfo1->domain != dinfo2->domain) { + pr_err("Failed because matching domain not found for diff\n"); + return -1; + } - ds = dptr->domain_data; - dinfo = cd_map[ds->cpu]->domains[ds->domain]; if (is_summary) { - if (dinfo->dname) + if (dinfo1->dname) printf("CPU: | DOMAIN: %s\n", - dinfo->dname); + dinfo1->dname); else printf("CPU: | DOMAIN: %d\n", - dinfo->domain); + dinfo1->domain); } else { - if (dinfo->dname) + if (dinfo1->dname) printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ", - cs->cpu, dinfo->dname); + cs1->cpu, dinfo1->dname); else printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ", - cs->cpu, dinfo->domain); + cs1->cpu, dinfo1->domain); - printf("%s\n", dinfo->cpulist); + printf("%s\n", dinfo1->cpulist); } print_separator2(100, "", 0); - print_domain_stats(ds, jiffies); + print_domain_stats(ds1, ds2, jiffies1, jiffies2); print_separator2(100, "", 0); + + if (dptr2) + dptr2 = list_next_entry(dptr2, domain_list); } + if (summary_only) + break; + + if (cptr2) + cptr2 = list_next_entry(cptr2, cpu_list); + is_summary = false; } return ret; @@ -4417,7 +4522,7 @@ static int perf_sched__schedstat_report(struct perf_sched *sched) } cd_map = session->header.env.cpu_domain; - err = show_schedstat_data(&cpu_head, cd_map); + err = show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false); } out: @@ -4426,6 +4531,100 @@ out: return err; } +static int perf_sched__schedstat_diff(struct perf_sched *sched, + int argc, const char **argv) +{ + struct cpu_domain_map **cd_map0 = NULL, **cd_map1 = NULL; + struct list_head cpu_head_ses0, cpu_head_ses1; + struct perf_session *session[2]; + struct perf_data data[2]; + int ret = 0, err = 0; + static const char *defaults[] = { + "perf.data.old", + "perf.data", + }; + + if (argc) { + if (argc == 1) + defaults[1] = argv[0]; + else if (argc == 2) { + defaults[0] = argv[0]; + defaults[1] = argv[1]; + } else { + pr_err("perf sched stats diff is not supported with more than 2 files.\n"); + goto out_ret; + } + } + + INIT_LIST_HEAD(&cpu_head_ses0); + INIT_LIST_HEAD(&cpu_head_ses1); + + sched->tool.schedstat_cpu = perf_sched__process_schedstat; + sched->tool.schedstat_domain = perf_sched__process_schedstat; + + data[0].path = defaults[0]; + data[0].mode = PERF_DATA_MODE_READ; + session[0] = perf_session__new(&data[0], &sched->tool); + if (IS_ERR(session[0])) { + ret = PTR_ERR(session[0]); + pr_err("Failed to open %s\n", data[0].path); + goto out_delete_ses0; + } + + err = perf_session__process_events(session[0]); + if (err) + goto out_delete_ses0; + + cd_map0 = session[0]->header.env.cpu_domain; + list_replace_init(&cpu_head, &cpu_head_ses0); + after_workload_flag = false; + + data[1].path = defaults[1]; + data[1].mode = PERF_DATA_MODE_READ; + session[1] = perf_session__new(&data[1], &sched->tool); + if (IS_ERR(session[1])) { + ret = PTR_ERR(session[1]); + pr_err("Failed to open %s\n", data[1].path); + goto out_delete_ses1; + } + + err = perf_session__process_events(session[1]); + if (err) + goto out_delete_ses1; + + cd_map1 = session[1]->header.env.cpu_domain; + list_replace_init(&cpu_head, &cpu_head_ses1); + after_workload_flag = false; + setup_pager(); + + if (list_empty(&cpu_head_ses1)) { + pr_err("Data is not available\n"); + ret = -1; + goto out_delete_ses1; + } + + if (list_empty(&cpu_head_ses0)) { + pr_err("Data is not available\n"); + ret = -1; + goto out_delete_ses0; + } + + show_schedstat_data(&cpu_head_ses0, cd_map0, &cpu_head_ses1, cd_map1, true); + +out_delete_ses1: + free_schedstat(&cpu_head_ses1); + if (!IS_ERR(session[1])) + perf_session__delete(session[1]); + +out_delete_ses0: + free_schedstat(&cpu_head_ses0); + if (!IS_ERR(session[0])) + perf_session__delete(session[0]); + +out_ret: + return ret; +} + static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, @@ -4515,7 +4714,7 @@ static int perf_sched__schedstat_live(struct perf_sched *sched, nr = cpu__max_present_cpu().cpu; cd_map = build_cpu_domain_map(&sv, &md, nr); - show_schedstat_data(&cpu_head, cd_map); + show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false); free_cpu_domain_info(cd_map, sv, nr); out: free_schedstat(&cpu_head); @@ -4847,6 +5046,11 @@ int cmd_sched(int argc, const char **argv) argc = parse_options(argc, argv, stats_options, stats_usage, 0); return perf_sched__schedstat_report(&sched); + } else if (argv[0] && !strcmp(argv[0], "diff")) { + if (argc) + argc = parse_options(argc, argv, stats_options, + stats_usage, 0); + return perf_sched__schedstat_diff(&sched, argc, argv); } return perf_sched__schedstat_live(&sched, argc, argv); } else { -- cgit v1.2.3 From c6b1f5707509c2718832a8f79e1d1510c85bcc75 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:31 +0000 Subject: perf sched stats: Add basic 'perf sched stats' test Add basic test for 'perf sched stats {record|report|diff}' subcommand. Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Link: https://lore.kernel.org/r/20260119175833.340369-10-swapnil.sapkal@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/perf_sched_stats.sh | 64 ++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 tools/perf/tests/shell/perf_sched_stats.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/perf_sched_stats.sh b/tools/perf/tests/shell/perf_sched_stats.sh new file mode 100755 index 000000000000..2b1410b050d0 --- /dev/null +++ b/tools/perf/tests/shell/perf_sched_stats.sh @@ -0,0 +1,64 @@ +#!/bin/sh +# perf sched stats tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +err=0 +test_perf_sched_stats_record() { + echo "Basic perf sched stats record test" + if ! perf sched stats record true 2>&1 | \ + grep -E -q "[ perf sched stats: Wrote samples to perf.data ]" + then + echo "Basic perf sched stats record test [Failed]" + err=1 + return + fi + echo "Basic perf sched stats record test [Success]" +} + +test_perf_sched_stats_report() { + echo "Basic perf sched stats report test" + perf sched stats record true > /dev/null + if ! perf sched stats report 2>&1 | grep -E -q "Description" + then + echo "Basic perf sched stats report test [Failed]" + err=1 + rm perf.data + return + fi + rm perf.data + echo "Basic perf sched stats report test [Success]" +} + +test_perf_sched_stats_live() { + echo "Basic perf sched stats live mode test" + if ! perf sched stats true 2>&1 | grep -E -q "Description" + then + echo "Basic perf sched stats live mode test [Failed]" + err=1 + return + fi + echo "Basic perf sched stats live mode test [Success]" +} + +test_perf_sched_stats_diff() { + echo "Basic perf sched stats diff test" + perf sched stats record true > /dev/null + perf sched stats record true > /dev/null + if ! perf sched stats diff > /dev/null + then + echo "Basic perf sched stats diff test [Failed]" + err=1 + rm perf.data.old perf.data + return + fi + rm perf.data.old perf.data + echo "Basic perf sched stats diff test [Success]" +} + +test_perf_sched_stats_record +test_perf_sched_stats_report +test_perf_sched_stats_live +test_perf_sched_stats_diff +exit $err -- cgit v1.2.3 From 800af362d68945e589f73cda429d04bfe4287feb Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 19 Jan 2026 17:58:32 +0000 Subject: perf sched stats: Add details in man page Document 'perf sched stats' purpose, usage examples and guide on how to interpret the report data in the perf-sched man page. Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Tested-by: Chen Yu Acked-by: Ian Rogers Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anubhav Shelat Cc: Ben Gainey Cc: Blake Jones Cc: Chun-Tse Shao Cc: David Vernet Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Gautham Shenoy Cc: Graham Woodward Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Juri Lelli Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Madadi Vineeth Reddy Cc: Mark Rutland Cc: Namhyung Kim Cc: Sandipan Das Cc: Santosh Shukla Cc: Shrikanth Hegde Cc: Steven Rostedt (VMware) Cc: Tejun Heo Cc: Thomas Falcon Cc: Tim Chen Cc: Vincent Guittot Cc: Yang Jihong Cc: Yujie Liu Cc: Zhongqiu Han Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 261 +++++++++++++++++++++++++++++++- 1 file changed, 260 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 6dbbddb6464d..5bfb7bb6c633 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -8,7 +8,7 @@ perf-sched - Tool to trace/measure scheduler properties (latencies) SYNOPSIS -------- [verse] -'perf sched' {record|latency|map|replay|script|timehist} +'perf sched' {record|latency|map|replay|script|timehist|stats} DESCRIPTION ----------- @@ -80,8 +80,267 @@ There are several variants of 'perf sched': Times are in msec.usec. + 'perf sched stats {record | report | diff} ' to capture, report the diff + in schedstat counters and show the difference between perf sched stats report + respectively. schedstat counters which are present in the linux kernel and are + exposed through the file ``/proc/schedstat``. These counters are enabled or disabled + via the sysctl governed by the file ``/proc/sys/kernel/sched_schedstats``. These + counters accounts for many scheduler events such as ``schedule()`` calls, load-balancing + events, ``try_to_wakeup()`` call among others. This is useful in understading the + scheduler behavior for the workload. + + Note: The tool will not give correct results if there is topological reordering or + online/offline of cpus in between capturing snapshots of `/proc/schedstat`. + + Example usage: + perf sched stats record -- sleep 1 + perf sched stats report + perf sched stats diff + + A detailed description of the schedstats can be found in the Kernel Documentation: + https://www.kernel.org/doc/html/latest/scheduler/sched-stats.html + + The result can be interprested as follows: + + The `perf sched stats report` starts with description of the columns present in + the report. These column names are given before cpu and domain stats to improve + the readability of the report. + + ---------------------------------------------------------------------------------------------------- + DESC -> Description of the field + COUNT -> Value of the field + PCT_CHANGE -> Percent change with corresponding base value + AVG_JIFFIES -> Avg time in jiffies between two consecutive occurrence of event + ---------------------------------------------------------------------------------------------------- + + Next is the total profiling time in terms of jiffies: + + ---------------------------------------------------------------------------------------------------- + Time elapsed (in jiffies) : 24537 + ---------------------------------------------------------------------------------------------------- + + Next is CPU scheduling statistics. These are simple diffs of /proc/schedstat CPU lines + along with description. The report also prints % relative to base stat. + + In the example below, schedule() left the CPU0 idle 36.58% of the time. 0.45% of total + try_to_wake_up() was to wakeup local CPU. And, the total waittime by tasks on CPU0 is + 48.70% of the total runtime by tasks on the same CPU. + + ---------------------------------------------------------------------------------------------------- + CPU 0 + ---------------------------------------------------------------------------------------------------- + DESC COUNT PCT_CHANGE + ---------------------------------------------------------------------------------------------------- + yld_count : 0 + array_exp : 0 + sched_count : 402267 + sched_goidle : 147161 ( 36.58% ) + ttwu_count : 236309 + ttwu_local : 1062 ( 0.45% ) + rq_cpu_time : 7083791148 + run_delay : 3449973971 ( 48.70% ) + pcount : 255035 + ---------------------------------------------------------------------------------------------------- + + Next is load balancing statistics. For each of the sched domains + (eg: `SMT`, `MC`, `DIE`...), the scheduler computes statistics under + the following three categories: + + 1) Idle Load Balance: Load balancing performed on behalf of a long + idling CPU by some other CPU. + 2) Busy Load Balance: Load balancing performed when the CPU was busy. + 3) New Idle Balance : Load balancing performed when a CPU just became + idle. + + Under each of these three categories, sched stats report provides + different load balancing statistics. Along with direct stats, the + report also contains derived metrics prefixed with *. Example: + + ---------------------------------------------------------------------------------------------------- + CPU 0, DOMAIN SMT CPUS 0,64 + ---------------------------------------------------------------------------------------------------- + DESC COUNT AVG_JIFFIES + ----------------------------------------- ------------------------------------------ + busy_lb_count : 136 $ 17.08 $ + busy_lb_balanced : 131 $ 17.73 $ + busy_lb_failed : 0 $ 0.00 $ + busy_lb_imbalance_load : 58 + busy_lb_imbalance_util : 0 + busy_lb_imbalance_task : 0 + busy_lb_imbalance_misfit : 0 + busy_lb_gained : 7 + busy_lb_hot_gained : 0 + busy_lb_nobusyq : 2 $ 1161.50 $ + busy_lb_nobusyg : 129 $ 18.01 $ + *busy_lb_success_count : 5 + *busy_lb_avg_pulled : 1.40 + ----------------------------------------- ------------------------------------------ + idle_lb_count : 449 $ 5.17 $ + idle_lb_balanced : 382 $ 6.08 $ + idle_lb_failed : 3 $ 774.33 $ + idle_lb_imbalance_load : 0 + idle_lb_imbalance_util : 0 + idle_lb_imbalance_task : 71 + idle_lb_imbalance_misfit : 0 + idle_lb_gained : 67 + idle_lb_hot_gained : 0 + idle_lb_nobusyq : 0 $ 0.00 $ + idle_lb_nobusyg : 382 $ 6.08 $ + *idle_lb_success_count : 64 + *idle_lb_avg_pulled : 1.05 + ---------------------------------------- ---------------------------------------- + newidle_lb_count : 30471 $ 0.08 $ + newidle_lb_balanced : 28490 $ 0.08 $ + newidle_lb_failed : 633 $ 3.67 $ + newidle_lb_imbalance_load : 0 + newidle_lb_imbalance_util : 0 + newidle_lb_imbalance_task : 2040 + newidle_lb_imbalance_misfit : 0 + newidle_lb_gained : 1348 + newidle_lb_hot_gained : 0 + newidle_lb_nobusyq : 6 $ 387.17 $ + newidle_lb_nobusyg : 26634 $ 0.09 $ + *newidle_lb_success_count : 1348 + *newidle_lb_avg_pulled : 1.00 + ---------------------------------------------------------------------------------------------------- + + Consider following line: + + newidle_lb_balanced : 28490 $ 0.08 $ + + While profiling was active, the load-balancer found 28490 times the load + needs to be balanced on a newly idle CPU 0. Following value encapsulated + inside $ is average jiffies between two events (28490 / 24537 = 0.08). + + Next are active_load_balance() stats. alb did not trigger while the + profiling was active, hence it's all 0s. + + --------------------------------- --------------------------------- + alb_count : 0 + alb_failed : 0 + alb_pushed : 0 + ---------------------------------------------------------------------------------------------------- + + Next are sched_balance_exec() and sched_balance_fork() stats. They are + not used but we kept it in RFC just for legacy purpose. Unless opposed, + we plan to remove them in next revision. + + Next are wakeup statistics. For every domain, the report also shows + task-wakeup statistics. Example: + + ------------------------------------------ ------------------------------------------- + ttwu_wake_remote : 1590 + ttwu_move_affine : 84 + ttwu_move_balance : 0 + ---------------------------------------------------------------------------------------------------- + + Same set of stats are reported for each CPU and each domain level. + + How to interpret the diff + ~~~~~~~~~~~~~~~~~~~~~~~~~ + + The `perf sched stats diff` will also start with explaining the columns + present in the diff. Then it will show the diff in time in terms of + jiffies. The order of the values depends on the order of input data + files. It will take `perf.data.old` and `perf.data` respectively as the + defaults for comparison. Example: + + ---------------------------------------------------------------------------------------------------- + Time elapsed (in jiffies) : 2009, 2001 + ---------------------------------------------------------------------------------------------------- + + Below is the sample representing the difference in cpu and domain stats of + two runs. Here third column or the values enclosed in `|...|` shows the + percent change between the two. Second and fourth columns shows the + side-by-side representions of the corresponding fields from `perf sched + stats report`. + + ---------------------------------------------------------------------------------------------------- + CPU + ---------------------------------------------------------------------------------------------------- + DESC COUNT1 COUNT2 PCT_CHANG> + ---------------------------------------------------------------------------------------------------- + yld_count : 0, 0 | 0.00> + array_exp : 0, 0 | 0.00> + sched_count : 528533, 412573 | -21.94> + sched_goidle : 193426, 146082 | -24.48> + ttwu_count : 313134, 385975 | 23.26> + ttwu_local : 1126, 1282 | 13.85> + rq_cpu_time : 8257200244, 8301250047 | 0.53> + run_delay : 4728347053, 3997100703 | -15.47> + pcount : 335031, 266396 | -20.49> + ---------------------------------------------------------------------------------------------------- + + Below is the sample of domain stats diff: + + ---------------------------------------------------------------------------------------------------- + CPU , DOMAIN SMT + ---------------------------------------------------------------------------------------------------- + DESC COUNT1 COUNT2 PCT_CHANG> + ----------------------------------------- ------------------------------------------ + busy_lb_count : 122, 80 | -34.43> + busy_lb_balanced : 115, 76 | -33.91> + busy_lb_failed : 1, 3 | 200.00> + busy_lb_imbalance_load : 35, 49 | 40.00> + busy_lb_imbalance_util : 0, 0 | 0.00> + busy_lb_imbalance_task : 0, 0 | 0.00> + busy_lb_imbalance_misfit : 0, 0 | 0.00> + busy_lb_gained : 7, 2 | -71.43> + busy_lb_hot_gained : 0, 0 | 0.00> + busy_lb_nobusyq : 0, 0 | 0.00> + busy_lb_nobusyg : 115, 76 | -33.91> + *busy_lb_success_count : 6, 1 | -83.33> + *busy_lb_avg_pulled : 1.17, 2.00 | 71.43> + ----------------------------------------- ------------------------------------------ + idle_lb_count : 568, 620 | 9.15> + idle_lb_balanced : 462, 449 | -2.81> + idle_lb_failed : 11, 21 | 90.91> + idle_lb_imbalance_load : 0, 0 | 0.00> + idle_lb_imbalance_util : 0, 0 | 0.00> + idle_lb_imbalance_task : 115, 189 | 64.35> + idle_lb_imbalance_misfit : 0, 0 | 0.00> + idle_lb_gained : 103, 169 | 64.08> + idle_lb_hot_gained : 0, 0 | 0.00> + idle_lb_nobusyq : 0, 0 | 0.00> + idle_lb_nobusyg : 462, 449 | -2.81> + *idle_lb_success_count : 95, 150 | 57.89> + *idle_lb_avg_pulled : 1.08, 1.13 | 3.92> + ---------------------------------------- ---------------------------------------- + newidle_lb_count : 16961, 3155 | -81.40> + newidle_lb_balanced : 15646, 2556 | -83.66> + newidle_lb_failed : 397, 142 | -64.23> + newidle_lb_imbalance_load : 0, 0 | 0.00> + newidle_lb_imbalance_util : 0, 0 | 0.00> + newidle_lb_imbalance_task : 1376, 655 | -52.40> + newidle_lb_imbalance_misfit : 0, 0 | 0.00> + newidle_lb_gained : 917, 457 | -50.16> + newidle_lb_hot_gained : 0, 0 | 0.00> + newidle_lb_nobusyq : 3, 1 | -66.67> + newidle_lb_nobusyg : 14480, 2103 | -85.48> + *newidle_lb_success_count : 918, 457 | -50.22> + *newidle_lb_avg_pulled : 1.00, 1.00 | 0.11> + --------------------------------- --------------------------------- + alb_count : 0, 1 | 0.00> + alb_failed : 0, 0 | 0.00> + alb_pushed : 0, 1 | 0.00> + --------------------------------- ---------------------------------- + sbe_count : 0, 0 | 0.00> + sbe_balanced : 0, 0 | 0.00> + sbe_pushed : 0, 0 | 0.00> + --------------------------------- ---------------------------------- + sbf_count : 0, 0 | 0.00> + sbf_balanced : 0, 0 | 0.00> + sbf_pushed : 0, 0 | 0.00> + ------------------------------------------ ------------------------------------------- + ttwu_wake_remote : 2031, 2914 | 43.48> + ttwu_move_affine : 73, 124 | 69.86> + ttwu_move_balance : 0, 0 | 0.00> + ---------------------------------------------------------------------------------------------------- + OPTIONS ------- +Applicable to {record|latency|map|replay|script} + -i:: --input=:: Input file name. (default: perf.data unless stdin is a fifo) -- cgit v1.2.3 From 7ff8b1d60881c5f97b5ae426e14d2822917d3b69 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jan 2026 12:20:28 -0600 Subject: cxl/pci: Remove CXL VH handling in CONFIG_PCIEAER_CXL conditional blocks from core/pci.c Create new config CONFIG_CXL_RAS and put all CXL RAS items behind the config. The config will depend on CPER and PCIE AER to build. Move the related VH RAS code from core/pci.c to core/ras.c. Restricted CXL host (RCH) RAS functions will be moved in a future patch. Cc: Robert Richter Reviewed-by: Joshua Hahn Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Reviewed-by: Alison Schofield Co-developed-by: Terry Bowman Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-8-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 4 + drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 31 ++++++++ drivers/cxl/core/pci.c | 189 +--------------------------------------------- drivers/cxl/core/ras.c | 176 ++++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 8 -- drivers/cxl/cxlpci.h | 16 ++++ tools/testing/cxl/Kbuild | 2 +- 8 files changed, 233 insertions(+), 195 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb8..217888992c88 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -233,4 +233,8 @@ config CXL_MCE def_bool y depends on X86_MCE && MEMORY_FAILURE +config CXL_RAS + def_bool y + depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 5ad8fef210b5..b2930cc54f8b 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,9 +14,9 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o -cxl_core-y += ras.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o +cxl_core-$(CONFIG_CXL_RAS) += ras.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b777..bc818de87ccc 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -144,8 +144,39 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +#ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +#else +static inline int cxl_ras_init(void) +{ + return 0; +} + +static inline void cxl_ras_exit(void) +{ +} + +static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + return false; +} +static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +#endif /* CONFIG_CXL_RAS */ + +/* Restricted CXL Host specific RAS functions */ +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); +#else +static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } +static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } +static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif /* CONFIG_CXL_RAS */ + int cxl_gpf_port_setup(struct cxl_dport *dport); struct cxl_hdm; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 51bb0f372e40..e132fff80979 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,81 +632,8 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - void __iomem *addr; - u32 status; - - if (!ras_base) - return; - - addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); - } -} - -/* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(void __iomem *ras_base, u32 *log) -{ - void __iomem *addr; - u32 *log_addr; - int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - - addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; - log_addr = log; - - for (i = 0; i < log_u32_size; i++) { - *log_addr = readl(addr); - log_addr++; - addr += sizeof(u32); - } -} - -/* - * Log the state of the RAS status registers and prepare them to log the - * next error status. Return 1 if reset needed. - */ -static bool cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - u32 hl[CXL_HEADERLOG_SIZE_U32]; - void __iomem *addr; - u32 status; - u32 fe; - - if (!ras_base) - return false; - - addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) - return false; - - /* If multiple errors, log header points to first error from ctrl reg */ - if (hweight32(status) > 1) { - void __iomem *rcc_addr = - ras_base + CXL_RAS_CAP_CONTROL_OFFSET; - - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, - readl(rcc_addr))); - } else { - fe = status; - } - - header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); - writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); - - return true; -} - -#ifdef CONFIG_PCIEAER_CXL - -static void cxl_dport_map_rch_aer(struct cxl_dport *dport) +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport) { resource_size_t aer_phys; struct device *host; @@ -721,19 +648,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) } } -static void cxl_dport_map_ras(struct cxl_dport *dport) -{ - struct cxl_register_map *map = &dport->reg_map; - struct device *dev = dport->dport_dev; - - if (!map->component_map.ras.valid) - dev_dbg(dev, "RAS registers not found\n"); - else if (cxl_map_component_regs(map, &dport->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS))) - dev_dbg(dev, "Failed to map RAS capability.\n"); -} - -static void cxl_disable_rch_root_ints(struct cxl_dport *dport) +void cxl_disable_rch_root_ints(struct cxl_dport *dport) { void __iomem *aer_base = dport->regs.dport_aer; u32 aer_cmd_mask, aer_cmd; @@ -757,28 +672,6 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); } -/** - * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport - * @dport: the cxl_dport that needs to be initialized - * @host: host device for devm operations - */ -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - dport->reg_map.host = host; - cxl_dport_map_ras(dport); - - if (dport->rch) { - struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); - - if (!host_bridge->native_aer) - return; - - cxl_dport_map_rch_aer(dport); - cxl_disable_rch_root_ints(dport); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. Clear the @@ -827,7 +720,7 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, return false; } -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); struct aer_capability_regs aer_regs; @@ -852,82 +745,8 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) else cxl_handle_ras(cxlds, dport->regs.ras); } - -#else -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } #endif -void cxl_cor_error_detected(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct device *dev = &cxlds->cxlmd->dev; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); - -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - bool ue; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); - } - - - switch (state) { - case pci_channel_io_normal: - if (ue) { - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - } - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(&pdev->dev, - "%s: frozen state error detected, disable CXL.mem\n", - dev_name(dev)); - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(&pdev->dev, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 2731ba3a0799..b933030b8e1e 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "trace.h" static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, @@ -124,3 +125,178 @@ void cxl_ras_exit(void) cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); cancel_work_sync(&cxl_cper_prot_err_work); } + +static void cxl_dport_map_ras(struct cxl_dport *dport) +{ + struct cxl_register_map *map = &dport->reg_map; + struct device *dev = dport->dport_dev; + + if (!map->component_map.ras.valid) + dev_dbg(dev, "RAS registers not found\n"); + else if (cxl_map_component_regs(map, &dport->regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(dev, "Failed to map RAS capability.\n"); +} + +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +{ + dport->reg_map.host = host; + cxl_dport_map_ras(dport); + + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); + + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + void __iomem *addr; + u32 status; + + if (!ras_base) + return; + + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } +} + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(void __iomem *ras_base, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. + */ +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!ras_base) + return false; + + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + void __iomem *rcc_addr = + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); + } else { + fe = status; + } + + header_log_copy(ras_base, hl); + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + } + + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249..42a76a7a088f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -803,14 +803,6 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t rcrb); -#ifdef CONFIG_PCIEAER_CXL -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); -#else -static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, - struct device *host) { } -#endif - struct cxl_decoder *to_cxl_decoder(struct device *dev); struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index cdb7cf3dbcb4..6f9c78886fd9 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -76,7 +76,23 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); + +#ifdef CONFIG_CXL_RAS void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); +#else +static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } + +static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + return PCI_ERS_RESULT_NONE; +} + +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } +#endif + #endif /* __CXL_PCI_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1..b7ea66382f3b 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -57,12 +57,12 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o -cxl_core-y += $(CXL_CORE_SRC)/ras.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o -- cgit v1.2.3 From 0ff60f2ec3e4043a442e805f80f8a2445113ec8f Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:29 -0600 Subject: cxl/pci: Move CXL driver's RCH error handling into core/ras_rch.c Restricted CXL Host (RCH) protocol error handling uses a procedure distinct from the CXL Virtual Hierarchy (VH) handling. This is because of the differences in the RCH and VH topologies. Improve the maintainability and add ability to enable/disable RCH handling. Move and combine the RCH handling code into a single block conditionally compiled with the CONFIG_CXL_RCH_RAS kernel config. Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260114182055.46029-9-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 11 ++--- drivers/cxl/core/pci.c | 115 ------------------------------------------ drivers/cxl/core/ras_rch.c | 121 +++++++++++++++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 1 + 5 files changed, 126 insertions(+), 123 deletions(-) create mode 100644 drivers/cxl/core/ras_rch.c (limited to 'tools') diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index b2930cc54f8b..b37f38d502d8 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o cxl_core-$(CONFIG_CXL_RAS) += ras.o +cxl_core-$(CONFIG_CXL_RAS) += ras_rch.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index bc818de87ccc..724361195057 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -149,6 +149,9 @@ int cxl_ras_init(void); void cxl_ras_exit(void); bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); #else static inline int cxl_ras_init(void) { @@ -164,14 +167,6 @@ static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras return false; } static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } -#endif /* CONFIG_CXL_RAS */ - -/* Restricted CXL Host specific RAS functions */ -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport); -void cxl_disable_rch_root_ints(struct cxl_dport *dport); -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); -#else static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index e132fff80979..b838c59d7a3c 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,121 +632,6 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport) -{ - resource_size_t aer_phys; - struct device *host; - u16 aer_cap; - - aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); - if (aer_cap) { - host = dport->reg_map.host; - aer_phys = aer_cap + dport->rcrb.base; - dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys, - sizeof(struct aer_capability_regs)); - } -} - -void cxl_disable_rch_root_ints(struct cxl_dport *dport) -{ - void __iomem *aer_base = dport->regs.dport_aer; - u32 aer_cmd_mask, aer_cmd; - - if (!aer_base) - return; - - /* - * Disable RCH root port command interrupts. - * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors - * - * This sequence may not be necessary. CXL spec states disabling - * the root cmd register's interrupts is required. But, PCI spec - * shows these are disabled by default on reset. - */ - aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | - PCI_ERR_ROOT_CMD_NONFATAL_EN | - PCI_ERR_ROOT_CMD_FATAL_EN); - aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); - aer_cmd &= ~aer_cmd_mask; - writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); -} - -/* - * Copy the AER capability registers using 32 bit read accesses. - * This is necessary because RCRB AER capability is MMIO mapped. Clear the - * status after copying. - * - * @aer_base: base address of AER capability block in RCRB - * @aer_regs: destination for copying AER capability - */ -static bool cxl_rch_get_aer_info(void __iomem *aer_base, - struct aer_capability_regs *aer_regs) -{ - int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); - u32 *aer_regs_buf = (u32 *)aer_regs; - int n; - - if (!aer_base) - return false; - - /* Use readl() to guarantee 32-bit accesses */ - for (n = 0; n < read_cnt; n++) - aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); - - writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); - writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); - - return true; -} - -/* Get AER severity. Return false if there is no error. */ -static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, - int *severity) -{ - if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { - if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) - *severity = AER_FATAL; - else - *severity = AER_NONFATAL; - return true; - } - - if (aer_regs->cor_status & ~aer_regs->cor_mask) { - *severity = AER_CORRECTABLE; - return true; - } - - return false; -} - -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) -{ - struct pci_dev *pdev = to_pci_dev(cxlds->dev); - struct aer_capability_regs aer_regs; - struct cxl_dport *dport; - int severity; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return; - - if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) - return; - - if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) - return; - - pci_print_aer(pdev, severity, &aer_regs); - - if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); - else - cxl_handle_ras(cxlds, dport->regs.ras); -} -#endif - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c new file mode 100644 index 000000000000..ed58afd18ecc --- /dev/null +++ b/drivers/cxl/core/ras_rch.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include +#include +#include "cxl.h" +#include "core.h" +#include "cxlmem.h" + +void cxl_dport_map_rch_aer(struct cxl_dport *dport) +{ + resource_size_t aer_phys; + struct device *host; + u16 aer_cap; + + aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); + if (aer_cap) { + host = dport->reg_map.host; + aer_phys = aer_cap + dport->rcrb.base; + dport->regs.dport_aer = + devm_cxl_iomap_block(host, aer_phys, + sizeof(struct aer_capability_regs)); + } +} + +void cxl_disable_rch_root_ints(struct cxl_dport *dport) +{ + void __iomem *aer_base = dport->regs.dport_aer; + u32 aer_cmd_mask, aer_cmd; + + if (!aer_base) + return; + + /* + * Disable RCH root port command interrupts. + * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors + * + * This sequence may not be necessary. CXL spec states disabling + * the root cmd register's interrupts is required. But, PCI spec + * shows these are disabled by default on reset. + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. + * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + int severity; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + if (severity == AER_CORRECTABLE) + cxl_handle_cor_ras(cxlds, dport->regs.ras); + else + cxl_handle_ras(cxlds, dport->regs.ras); +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index b7ea66382f3b..6eceefefb0e0 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o -- cgit v1.2.3 From a081b5789255d27b76cd2cbab85676b2a31dbde1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 16 Jan 2026 10:34:02 +0100 Subject: kallsyms: Get rid of kallsyms relative base When the kallsyms relative base was introduced, per-CPU variable references on x86_64 SMP were implemented as offsets into the respective per-CPU region, rather than offsets relative to the location of the variable's template in the kernel image, which is how other architectures implement it. This required kallsyms to reason about the difference between the two, and the sign of the value in the kallsyms_offsets[] array was used to distinguish them. This meant that negative offsets were not permitted for ordinary variables, and so it was crucial that the relative base was chosen such that all offsets were positive numbers. This is no longer needed: instead, the offsets can simply be encoded as values in the range -/+ 2 GiB, which is precisely what PC32 relocations provide on most architectures. So it is possible to simplify the logic, and just use _text as the anchor directly, and let the linker calculate the final value based on the location of the entry itself. Some architectures (nios2, extensa) do not support place-relative relocations at all, but these are all 32-bit and non-relocatable, and so there is no need for place-relative relocations in the first place, and the actual symbol values can just be stored directly. This makes all entries in the kallsyms_offsets[] array visible as place-relative references in the ELF metadata, which will be important when implementing ELF-based fg-kaslr. Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel Link: https://patch.msgid.link/20260116093359.2442297-6-ardb+git@google.com Signed-off-by: Nathan Chancellor --- kernel/kallsyms.c | 6 ++-- kernel/kallsyms_internal.h | 1 - kernel/vmcore_info.c | 1 - scripts/kallsyms.c | 64 ++++++++++--------------------------- scripts/link-vmlinux.sh | 4 +++ tools/perf/tests/vmlinux-kallsyms.c | 1 - 6 files changed, 25 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 049e296f586c..6125724aadb1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets */ - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + /* non-relocatable 32-bit kernels just embed the value directly */ + if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE)) + return (u32)kallsyms_offsets[idx]; + return (unsigned long)offset_to_ptr(kallsyms_offsets + idx); } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 9633782f8250..81a867dbe57d 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -8,7 +8,6 @@ extern const int kallsyms_offsets[]; extern const u8 kallsyms_names[]; extern const unsigned int kallsyms_num_syms; -extern const unsigned long kallsyms_relative_base; extern const char kallsyms_token_table[]; extern const u16 kallsyms_token_index[]; diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index fe9bf8db1922..f114719f6cb5 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -238,7 +238,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); #endif /* CONFIG_KALLSYMS */ arch_crash_save_vmcoreinfo(); diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 4b0234e4b12f..37d5c095ad22 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -46,7 +46,6 @@ struct addr_range { }; static unsigned long long _text; -static unsigned long long relative_base; static struct addr_range text_ranges[] = { { "_stext", "_etext" }, { "_sinittext", "_einittext" }, @@ -57,6 +56,7 @@ static struct addr_range text_ranges[] = { static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; +static int pc_relative; static int token_profit[0x10000]; @@ -280,7 +280,7 @@ static void read_map(const char *in) static void output_label(const char *label) { printf(".globl %s\n", label); - printf("\tALGN\n"); + printf("\t.balign 4\n"); printf("%s:\n", label); } @@ -343,15 +343,6 @@ static void write_src(void) unsigned int *markers, markers_cnt; char buf[KSYM_NAME_LEN]; - printf("#include \n"); - printf("#if BITS_PER_LONG == 64\n"); - printf("#define PTR .quad\n"); - printf("#define ALGN .balign 8\n"); - printf("#else\n"); - printf("#define PTR .long\n"); - printf("#define ALGN .balign 4\n"); - printf("#endif\n"); - printf("\t.section .rodata, \"a\"\n"); output_label("kallsyms_num_syms"); @@ -434,34 +425,24 @@ static void write_src(void) output_label("kallsyms_offsets"); for (i = 0; i < table_cnt; i++) { - /* - * Use the offset relative to the lowest value - * encountered of all relative symbols, and emit - * non-relocatable fixed offsets that will be fixed - * up at runtime. - */ - - long long offset; - - offset = table[i]->addr - relative_base; - if (offset < 0 || offset > UINT_MAX) { - fprintf(stderr, "kallsyms failure: " - "relative symbol value %#llx out of range\n", - table[i]->addr); - exit(EXIT_FAILURE); + if (pc_relative) { + long long offset = table[i]->addr - _text; + + if (offset < INT_MIN || offset > INT_MAX) { + fprintf(stderr, "kallsyms failure: " + "relative symbol value %#llx out of range\n", + table[i]->addr); + exit(EXIT_FAILURE); + } + printf("\t.long\t_text - . + (%d)\t/* %s */\n", + (int)offset, table[i]->sym); + } else { + printf("\t.long\t%#x\t/* %s */\n", + (unsigned int)table[i]->addr, table[i]->sym); } - printf("\t.long\t%#x\t/* %s */\n", (int)offset, table[i]->sym); } printf("\n"); - output_label("kallsyms_relative_base"); - /* Provide proper symbols relocatability by their '_text' relativeness. */ - if (_text <= relative_base) - printf("\tPTR\t_text + %#llx\n", relative_base - _text); - else - printf("\tPTR\t_text - %#llx\n", _text - relative_base); - printf("\n"); - sort_symbols_by_name(); output_label("kallsyms_seqs_of_names"); for (i = 0; i < table_cnt; i++) @@ -701,22 +682,12 @@ static void sort_symbols(void) qsort(table, table_cnt, sizeof(table[0]), compare_symbols); } -/* find the minimum non-absolute symbol address */ -static void record_relative_base(void) -{ - /* - * The table is sorted by address. - * Take the first symbol value. - */ - if (table_cnt) - relative_base = table[0]->addr; -} - int main(int argc, char **argv) { while (1) { static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, + {"pc-relative", no_argument, &pc_relative, 1}, {}, }; @@ -734,7 +705,6 @@ int main(int argc, char **argv) read_map(argv[optind]); shrink_table(); sort_symbols(); - record_relative_base(); optimize_token_table(); write_src(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 4ab44c73da4d..73531cb63efc 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -143,6 +143,10 @@ kallsyms() kallsymopt="${kallsymopt} --all-symbols" fi + if is_enabled CONFIG_64BIT || is_enabled CONFIG_RELOCATABLE; then + kallsymopt="${kallsymopt} --pc-relative" + fi + info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 74cdbd2ce9d0..524d46478364 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -27,7 +27,6 @@ static bool is_ignored_symbol(const char *name, char type) * stable symbol list. */ "kallsyms_offsets", - "kallsyms_relative_base", "kallsyms_num_syms", "kallsyms_names", "kallsyms_markers", -- cgit v1.2.3 From d07d7c3dd9446b47507d15fdfbb835638a2f6f50 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 21 Jan 2026 13:46:44 +0200 Subject: selftests: net: Add kernel selftest for RFC 4884 RFC 4884 extended certain ICMP messages with a length attribute that encodes the length of the "original datagram" field. This is needed so that new information could be appended to these messages without applications thinking that it is part of the "original datagram" field. In version 5.9, the kernel was extended with two new socket options (SOL_IP/IP_RECVERR_4884 and SOL_IPV6/IPV6_RECVERR_RFC4884) that allow user space to retrieve this length which is basically the offset to the ICMP Extension Structure at the end of the ICMP message. This is required by user space applications that need to parse the information contained in the ICMP Extension Structure. For example, the RFC 5837 extension for tracepath. Add a selftest that verifies correct handling of the RFC 4884 length field for both IPv4 and IPv6, with and without extension structures, and validates that malformed extensions are correctly reported as invalid. For each address family, the test creates: - a raw socket used to send locally crafted ICMP error packets to the loopback address, and - a datagram socket used to receive the encapsulated original datagram and associated error metadata from the kernel error queue. ICMP packets are constructed entirely in user space rather than relying on kernel-generated errors. This allows the test to exercise invalid scenarios (such as corrupted checksums and incorrect length fields) and verify that the SO_EE_RFC4884_FLAG_INVALID flag is set as expected. Output Example: $ ./icmp_rfc4884 Starting 18 tests from 18 test cases. RUN rfc4884.ipv4_ext_small_payload.rfc4884 ... OK rfc4884.ipv4_ext_small_payload.rfc4884 ok 1 rfc4884.ipv4_ext_small_payload.rfc4884 RUN rfc4884.ipv4_ext.rfc4884 ... OK rfc4884.ipv4_ext.rfc4884 ok 2 rfc4884.ipv4_ext.rfc4884 RUN rfc4884.ipv4_ext_large_payload.rfc4884 ... OK rfc4884.ipv4_ext_large_payload.rfc4884 ok 3 rfc4884.ipv4_ext_large_payload.rfc4884 RUN rfc4884.ipv4_no_ext_small_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_small_payload.rfc4884 ok 4 rfc4884.ipv4_no_ext_small_payload.rfc4884 RUN rfc4884.ipv4_no_ext_min_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_min_payload.rfc4884 ok 5 rfc4884.ipv4_no_ext_min_payload.rfc4884 RUN rfc4884.ipv4_no_ext_large_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_large_payload.rfc4884 ok 6 rfc4884.ipv4_no_ext_large_payload.rfc4884 RUN rfc4884.ipv4_invalid_ext_checksum.rfc4884 ... OK rfc4884.ipv4_invalid_ext_checksum.rfc4884 ok 7 rfc4884.ipv4_invalid_ext_checksum.rfc4884 RUN rfc4884.ipv4_invalid_ext_length_small.rfc4884 ... OK rfc4884.ipv4_invalid_ext_length_small.rfc4884 ok 8 rfc4884.ipv4_invalid_ext_length_small.rfc4884 RUN rfc4884.ipv4_invalid_ext_length_large.rfc4884 ... OK rfc4884.ipv4_invalid_ext_length_large.rfc4884 ok 9 rfc4884.ipv4_invalid_ext_length_large.rfc4884 RUN rfc4884.ipv6_ext_small_payload.rfc4884 ... OK rfc4884.ipv6_ext_small_payload.rfc4884 ok 10 rfc4884.ipv6_ext_small_payload.rfc4884 RUN rfc4884.ipv6_ext.rfc4884 ... OK rfc4884.ipv6_ext.rfc4884 ok 11 rfc4884.ipv6_ext.rfc4884 RUN rfc4884.ipv6_ext_large_payload.rfc4884 ... OK rfc4884.ipv6_ext_large_payload.rfc4884 ok 12 rfc4884.ipv6_ext_large_payload.rfc4884 RUN rfc4884.ipv6_no_ext_small_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_small_payload.rfc4884 ok 13 rfc4884.ipv6_no_ext_small_payload.rfc4884 RUN rfc4884.ipv6_no_ext_min_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_min_payload.rfc4884 ok 14 rfc4884.ipv6_no_ext_min_payload.rfc4884 RUN rfc4884.ipv6_no_ext_large_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_large_payload.rfc4884 ok 15 rfc4884.ipv6_no_ext_large_payload.rfc4884 RUN rfc4884.ipv6_invalid_ext_checksum.rfc4884 ... OK rfc4884.ipv6_invalid_ext_checksum.rfc4884 ok 16 rfc4884.ipv6_invalid_ext_checksum.rfc4884 RUN rfc4884.ipv6_invalid_ext_length_small.rfc4884 ... OK rfc4884.ipv6_invalid_ext_length_small.rfc4884 ok 17 rfc4884.ipv6_invalid_ext_length_small.rfc4884 RUN rfc4884.ipv6_invalid_ext_length_large.rfc4884 ... OK rfc4884.ipv6_invalid_ext_length_large.rfc4884 ok 18 rfc4884.ipv6_invalid_ext_length_large.rfc4884 PASSED: 18 / 18 tests passed. Totals: pass:18 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260121114644.2863640-1-danieller@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/icmp_rfc4884.c | 679 +++++++++++++++++++++++++++++ 3 files changed, 681 insertions(+) create mode 100644 tools/testing/selftests/net/icmp_rfc4884.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 6930fe926c58..97ad4d551d44 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -7,6 +7,7 @@ cmsg_sender epoll_busy_poll fin_ack_lat hwtstamp_config +icmp_rfc4884 io_uring_zerocopy_tx ioam6_parser ip_defrag diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b66ba04f19d9..fe7937dc5f45 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -166,6 +166,7 @@ TEST_GEN_PROGS := \ bind_timewait \ bind_wildcard \ epoll_busy_poll \ + icmp_rfc4884 \ ipv6_fragmentation \ proc_net_pktgen \ reuseaddr_conflict \ diff --git a/tools/testing/selftests/net/icmp_rfc4884.c b/tools/testing/selftests/net/icmp_rfc4884.c new file mode 100644 index 000000000000..cd826b913557 --- /dev/null +++ b/tools/testing/selftests/net/icmp_rfc4884.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +static const unsigned short src_port = 44444; +static const unsigned short dst_port = 55555; +static const int min_orig_dgram_len = 128; +static const int min_payload_len_v4 = + min_orig_dgram_len - sizeof(struct iphdr) - sizeof(struct udphdr); +static const int min_payload_len_v6 = + min_orig_dgram_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr); +static const uint8_t orig_payload_byte = 0xAA; + +struct sockaddr_inet { + union { + struct sockaddr_in6 v6; + struct sockaddr_in v4; + struct sockaddr sa; + }; + socklen_t len; +}; + +struct ip_case_info { + int domain; + int level; + int opt1; + int opt2; + int proto; + int (*build_func)(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len); + int min_payload; +}; + +static int bringup_loopback(void) +{ + struct ifreq ifr = { + .ifr_name = "lo" + }; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) + goto err; + + ifr.ifr_flags = ifr.ifr_flags | IFF_UP; + + if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) + goto err; + + close(fd); + return 0; + +err: + close(fd); + return -1; +} + +static uint16_t csum(const void *buf, size_t len) +{ + const uint8_t *data = buf; + uint32_t sum = 0; + + while (len > 1) { + sum += (data[0] << 8) | data[1]; + data += 2; + len -= 2; + } + + if (len == 1) + sum += data[0] << 8; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum & 0xFFFF; +} + +static int poll_err(int fd) +{ + struct pollfd pfd; + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = fd; + + if (poll(&pfd, 1, 5000) != 1 || pfd.revents != POLLERR) + return -1; + + return 0; +} + +static void set_addr(struct sockaddr_inet *addr, int domain, + unsigned short port) +{ + memset(addr, 0, sizeof(*addr)); + + switch (domain) { + case AF_INET: + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = htons(port); + addr->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr->len = sizeof(addr->v4); + break; + case AF_INET6: + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = htons(port); + addr->v6.sin6_addr = in6addr_loopback; + addr->len = sizeof(addr->v6); + break; + } +} + +static int bind_and_setsockopt(int fd, const struct ip_case_info *info) +{ + struct sockaddr_inet addr; + int opt = 1; + + set_addr(&addr, info->domain, src_port); + + if (setsockopt(fd, info->level, info->opt1, &opt, sizeof(opt)) < 0) + return -1; + + if (setsockopt(fd, info->level, info->opt2, &opt, sizeof(opt)) < 0) + return -1; + + return bind(fd, &addr.sa, addr.len); +} + +static int build_rfc4884_ext(uint8_t *buf, size_t buflen, bool bad_csum, + bool bad_len, bool smaller_len) +{ + struct icmp_extobj_hdr *objh; + struct icmp_ext_hdr *exthdr; + size_t obj_len, ext_len; + uint16_t sum; + + /* Use an object payload of 4 bytes */ + obj_len = sizeof(*objh) + sizeof(uint32_t); + ext_len = sizeof(*exthdr) + obj_len; + + if (ext_len > buflen) + return -EINVAL; + + exthdr = (struct icmp_ext_hdr *)buf; + objh = (struct icmp_extobj_hdr *)(buf + sizeof(*exthdr)); + + exthdr->version = 2; + /* When encoding a bad object length, either encode a length too small + * to fit the object header or too big to fit in the packet. + */ + if (bad_len) + obj_len = smaller_len ? sizeof(*objh) - 1 : obj_len * 2; + objh->length = htons(obj_len); + + sum = csum(buf, ext_len); + exthdr->checksum = htons(bad_csum ? sum - 1 : sum); + + return ext_len; +} + +static int build_orig_dgram_v4(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct iphdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct iphdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_UDP; + iph->saddr = htonl(INADDR_LOOPBACK); + iph->daddr = htonl(INADDR_LOOPBACK); + iph->tot_len = htons(len); + iph->check = htons(csum(iph, sizeof(*iph))); + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_orig_dgram_v6(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct ipv6hdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct ipv6hdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 6; + iph->payload_len = htons(sizeof(*udph) + payload_len); + iph->nexthdr = IPPROTO_UDP; + iph->saddr = in6addr_loopback; + iph->daddr = in6addr_loopback; + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_icmpv4_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmphdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmphdr *)buf; + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_PORT_UNREACH; + icmph->checksum = 0; + + ret = build_orig_dgram_v4(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->un.reserved[1] = (len - sizeof(*icmph)) / sizeof(uint32_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->checksum = htons(csum(icmph, len)); + return len; +} + +static int build_icmpv6_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmp6hdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmp6hdr *)buf; + icmph->icmp6_type = ICMPV6_DEST_UNREACH; + icmph->icmp6_code = ICMPV6_PORT_UNREACH; + icmph->icmp6_cksum = 0; + + ret = build_orig_dgram_v6(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->icmp6_datagram_len = (len - sizeof(*icmph)) / sizeof(uint64_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->icmp6_cksum = htons(csum(icmph, len)); + return len; +} + +FIXTURE(rfc4884) {}; + +FIXTURE_SETUP(rfc4884) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(ret, 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + } + + ret = bringup_loopback(); + ASSERT_EQ(ret, 0) TH_LOG("Failed to bring up loopback interface"); +} + +FIXTURE_TEARDOWN(rfc4884) +{ +} + +const struct ip_case_info ipv4_info = { + .domain = AF_INET, + .level = SOL_IP, + .opt1 = IP_RECVERR, + .opt2 = IP_RECVERR_RFC4884, + .proto = IPPROTO_ICMP, + .build_func = build_icmpv4_pkt, + .min_payload = min_payload_len_v4, +}; + +const struct ip_case_info ipv6_info = { + .domain = AF_INET6, + .level = SOL_IPV6, + .opt1 = IPV6_RECVERR, + .opt2 = IPV6_RECVERR_RFC4884, + .proto = IPPROTO_ICMPV6, + .build_func = build_icmpv6_pkt, + .min_payload = min_payload_len_v6, +}; + +FIXTURE_VARIANT(rfc4884) { + /* IPv4/v6 related information */ + struct ip_case_info info; + /* Whether to append an ICMP extension or not */ + bool with_ext; + /* UDP payload length */ + int payload_len; + /* Whether to generate a bad checksum in the ICMP extension structure */ + bool bad_csum; + /* Whether to generate a bad length in the ICMP object header */ + bool bad_len; + /* Whether it is too small to fit the object header or too big to fit + * in the packet + */ + bool smaller_len; +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_small_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_large_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_small_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not raise + * the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_min_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_large_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_checksum) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_small) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv4 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_large) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_small_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_large_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_small_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not + * raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_min_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_large_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_checksum) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_small) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv6 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_large) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +static void +check_rfc4884_offset(struct __test_metadata *_metadata, int sock, + const FIXTURE_VARIANT(rfc4884) *v) +{ + char rxbuf[1024]; + char ctrl[1024]; + struct iovec iov = { + .iov_base = rxbuf, + .iov_len = sizeof(rxbuf) + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl), + }; + struct cmsghdr *cmsg; + int recv; + + ASSERT_EQ(poll_err(sock), 0); + + recv = recvmsg(sock, &msg, MSG_ERRQUEUE); + ASSERT_GE(recv, 0) TH_LOG("recvmsg(MSG_ERRQUEUE) failed"); + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + bool is_invalid, expected_invalid; + struct sock_extended_err *ee; + int expected_off; + uint16_t off; + + if (cmsg->cmsg_level != v->info.level || + cmsg->cmsg_type != v->info.opt1) { + TH_LOG("Unrelated cmsgs were encountered in recvmsg()"); + continue; + } + + ee = (struct sock_extended_err *)CMSG_DATA(cmsg); + off = ee->ee_rfc4884.len; + is_invalid = ee->ee_rfc4884.flags & SO_EE_RFC4884_FLAG_INVALID; + + expected_invalid = v->bad_csum || v->bad_len; + ASSERT_EQ(is_invalid, expected_invalid) { + TH_LOG("Expected invalidity flag to be %d, but got %d", + expected_invalid, is_invalid); + } + + expected_off = + (v->with_ext && v->payload_len >= v->info.min_payload) ? + v->payload_len : 0; + ASSERT_EQ(off, expected_off) { + TH_LOG("Expected RFC4884 offset %u, got %u", + expected_off, off); + } + break; + } +} + +TEST_F(rfc4884, rfc4884) +{ + const typeof(variant) v = variant; + struct sockaddr_inet addr; + uint8_t pkt[1024]; + int dgram, raw; + int len, sent; + int err; + + dgram = socket(v->info.domain, SOCK_DGRAM, 0); + ASSERT_GE(dgram, 0) TH_LOG("Opening datagram socket failed"); + + err = bind_and_setsockopt(dgram, &v->info); + ASSERT_EQ(err, 0) TH_LOG("Bind failed"); + + raw = socket(v->info.domain, SOCK_RAW, v->info.proto); + ASSERT_GE(raw, 0) TH_LOG("Opening raw socket failed"); + + len = v->info.build_func(pkt, sizeof(pkt), v->with_ext, v->payload_len, + v->bad_csum, v->bad_len, v->smaller_len); + ASSERT_GT(len, 0) TH_LOG("Building packet failed"); + + set_addr(&addr, v->info.domain, 0); + sent = sendto(raw, pkt, len, 0, &addr.sa, addr.len); + ASSERT_EQ(len, sent) TH_LOG("Sending packet failed"); + + check_rfc4884_offset(_metadata, dgram, v); + + close(dgram); + close(raw); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From caf84294ff98bb7455722285f30f46c193ffccdd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:48 +0800 Subject: selftests: ublk: fix user_data truncation for tgt_data >= 256 The build_user_data() function packs multiple fields into a __u64 value using bit shifts. Without explicit __u64 casts before shifting, the shift operations are performed on 32-bit unsigned integers before being promoted to 64-bit, causing data loss. Specifically, when tgt_data >= 256, the expression (tgt_data << 24) shifts on a 32-bit value, truncating the upper 8 bits before promotion to __u64. Since tgt_data can be up to 16 bits (assertion allows up to 65535), values >= 256 would have their high byte lost. Add explicit __u64 casts to both op and tgt_data before shifting to ensure the shift operations happen in 64-bit space, preserving all bits of the input values. user_data_to_tgt_data() is only used by stripe.c, in which the max supported member disks are 4, so won't trigger this issue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index cb757fd9bf9d..69fd5794f300 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -262,7 +262,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } -- cgit v1.2.3 From 584709ad5ce359f8b5773eb6af40070412652c51 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:49 +0800 Subject: selftests: ublk: replace assert() with ublk_assert() Replace assert() with ublk_assert() since it is often triggered in daemon, and we may get nothing shown in terminal. Add ublk_assert(), so we can log something to syslog when assert() is triggered. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 2 +- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 10 +++++----- tools/testing/selftests/ublk/utils.h | 10 ++++++++++ 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index d9873d4d50d0..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c3ce5ff72422..889047bd8fa3 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3472ce7426ba..e98999bea9b1 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -825,7 +825,7 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 69fd5794f300..48634d29c084 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -260,7 +260,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 2be1c36438e7..b967447fe591 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -322,7 +322,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e..17eefed73690 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -43,6 +43,7 @@ static inline void ublk_err(const char *fmt, ...) va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +53,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +64,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif -- cgit v1.2.3 From f1d621b5a04ea41ee90f177db084d00db57e6839 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:50 +0800 Subject: selftests: ublk: add ublk_io_buf_idx() for returning io buffer index Since UBLK_F_PER_IO_DAEMON is added, io buffer index may depend on current thread because the common way is to use per-pthread io_ring_ctx for issuing ublk uring_cmd. Add one helper for returning io buffer index, so we can hide the buffer index implementation details for target code. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 9 +++++---- tools/testing/selftests/ublk/kublk.c | 9 +++++---- tools/testing/selftests/ublk/kublk.h | 10 +++++++++- tools/testing/selftests/ublk/null.c | 18 ++++++++++-------- tools/testing/selftests/ublk/stripe.c | 7 ++++--- 5 files changed, 33 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 889047bd8fa3..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -39,6 +39,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); @@ -62,7 +63,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, len, offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -71,7 +72,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -79,11 +80,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, len, offset); - sqe[1]->buf_index = tag; + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e98999bea9b1..9b6f1cd04dc4 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -605,16 +605,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -730,7 +731,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 48634d29c084..311a75da9b21 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -150,7 +150,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -393,6 +394,13 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) +{ + return q->ios[tag].buf_index; +} + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { return &q->ios[tag]; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 3aa162f08476..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -44,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -61,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -86,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -137,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index b967447fe591..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } -- cgit v1.2.3 From dccbfa9d416424fbcbc83a46e84c604bad1db9d0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:51 +0800 Subject: selftests: ublk: add batch buffer management infrastructure Add the foundational infrastructure for UBLK_F_BATCH_IO buffer management including: - Allocator utility functions for small sized per-thread allocation - Batch buffer allocation and deallocation functions - Buffer index management for commit buffers - Thread state management for batch I/O mode - Buffer size calculation based on device features This prepares the groundwork for handling batch I/O commands by establishing the buffer management layer needed for UBLK_U_IO_PREP_IO_CMDS and UBLK_U_IO_COMMIT_IO_CMDS operations. The allocator uses CPU sets for efficient per-thread buffer tracking, and commit buffers are pre-allocated with 2 buffers per thread to handle overlapping command operations. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 152 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 26 +++++- tools/testing/selftests/ublk/kublk.h | 53 ++++++++++++ tools/testing/selftests/ublk/utils.h | 54 +++++++++++++ 4 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/ublk/batch.c (limited to 'tools') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..609e6073c9c0 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. + */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int ret; + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. + */ + struct ublk_queue *q = &t->dev->q[0]; + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + ublk_assert(t->nr_commit_buf < 16); + return alloc_batch_commit_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 9b6f1cd04dc4..3864f42e6c29 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -435,6 +435,8 @@ static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -531,15 +533,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 311a75da9b21..424c333596ac 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -182,15 +182,40 @@ struct ublk_queue { struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct io_uring ring; }; @@ -211,6 +236,27 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, struct ublk_params *params) { @@ -465,6 +511,13 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index 17eefed73690..aab522f26167 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) -- cgit v1.2.3 From d468930a019df71951a80fde20f6348136a2175d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:52 +0800 Subject: selftests: ublk: handle UBLK_U_IO_PREP_IO_CMDS Implement support for UBLK_U_IO_PREP_IO_CMDS in the batch I/O framework: - Add batch command initialization and setup functions - Implement prep command queueing with proper buffer management - Add command completion handling for prep and commit commands - Integrate batch I/O setup into thread initialization - Update CQE handling to support batch commands The implementation uses the previously established buffer management infrastructure to queue UBLK_U_IO_PREP_IO_CMDS commands. Commands are prepared in the first thread context and use commit buffers for efficient command batching. Key changes: - ublk_batch_queue_prep_io_cmds() prepares I/O command batches - ublk_batch_compl_cmd() handles batch command completions - Modified thread setup to use batch operations when enabled - Enhanced buffer index calculation for batch mode Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 114 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 50 +++++++++++---- tools/testing/selftests/ublk/kublk.h | 22 +++++++ 3 files changed, 174 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 609e6073c9c0..079cae77add1 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -150,3 +150,117 @@ void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); } + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) + ;//assert(cqe->res == t->commit_buf_size); + else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3864f42e6c29..dba912a44eb3 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -840,6 +840,8 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; @@ -874,28 +876,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -952,6 +956,22 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + /* setup all queues in the 1st thread */ + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret == 0); + ret = ublk_process_io(t); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -972,8 +992,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else if (!t.idx) { + /* prepare all io commands in the 1st thread context */ + ublk_batch_setup_queues(&t); + } + do { if (ublk_process_io(&t) < 0) break; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 424c333596ac..08320d44c7c2 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -440,10 +440,16 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); + static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); return q->ios[tag].buf_index; } @@ -511,6 +517,22 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); /* Initialize batch I/O state and calculate buffer parameters */ void ublk_batch_prepare(struct ublk_thread *t); /* Allocate and register commit buffers for batch operations */ -- cgit v1.2.3 From dee7024ffecba291891503e425373d9f2a1d01b6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:53 +0800 Subject: selftests: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Implement UBLK_U_IO_COMMIT_IO_CMDS to enable efficient batched completion of I/O operations in the batch I/O framework. This completes the batch I/O infrastructure by adding the commit phase that notifies the kernel about completed I/O operations: Key features: - Batch multiple I/O completions into single UBLK_U_IO_COMMIT_IO_CMDS - Dynamic commit buffer allocation and management per thread - Automatic commit buffer preparation before processing events - Commit buffer submission after processing completed I/Os - Integration with existing completion workflows Implementation details: - ublk_batch_prep_commit() allocates and initializes commit buffers - ublk_batch_complete_io() adds completed I/Os to current batch - ublk_batch_commit_io_cmds() submits batched completions to kernel - Modified ublk_process_io() to handle batch commit lifecycle - Enhanced ublk_complete_io() to route to batch or legacy completion The commit buffer stores completion information (tag, result, buffer details) for multiple I/Os, then submits them all at once, significantly reducing syscall overhead compared to individual I/O completions. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 74 ++++++++++++++++++++++++++++++++++-- tools/testing/selftests/ublk/kublk.c | 8 +++- tools/testing/selftests/ublk/kublk.h | 69 +++++++++++++++++++++------------ 3 files changed, 122 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 079cae77add1..9c4db7335d44 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -174,7 +174,7 @@ static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, cmd->elem_bytes = elem_bytes; cmd->nr_elem = nr_elem; - user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); io_uring_sqe_set_data64(sqe, user_data); t->cmd_inflight += 1; @@ -244,9 +244,11 @@ static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) ublk_assert(cqe->res == 0); - else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) - ;//assert(cqe->res == t->commit_buf_size); - else + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else ublk_assert(0); ublk_free_commit_buf(t, buf_idx); @@ -264,3 +266,67 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, return; } } + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = t->commit.done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, t->commit.buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = t->commit.buf_idx; + sqe->addr = (__u64)t->commit.elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +static void ublk_batch_init_commit(struct ublk_thread *t, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + t->commit.q_id = t->idx; + t->commit.buf_idx = buf_idx; + t->commit.elem = ublk_get_commit_buf(t, buf_idx); + t->commit.done = 0; + t->commit.count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_batch_init_commit(t, buf_idx); +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + struct batch_commit_buf *cb = &t->commit; + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + + cb->done * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[tag]; + + ublk_assert(q->q_id == t->commit.q_id); + + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index dba912a44eb3..bf217d30c15f 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -931,7 +931,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 08320d44c7c2..5b05f6d7d808 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -190,6 +190,14 @@ struct ublk_batch_elem { __u64 buf_addr; }; +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -215,6 +223,7 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; + struct batch_commit_buf commit; struct io_uring ring; }; @@ -458,30 +467,6 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) -{ - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); -} - -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) -{ - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } -} - static inline int ublk_completed_tgt_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag) { @@ -540,6 +525,42 @@ int ublk_batch_alloc_buf(struct ublk_thread *t); /* Free commit buffers and cleanup batch allocator */ void ublk_batch_free_buf(struct ublk_thread *t); +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; -- cgit v1.2.3 From cb5a6b308700c65c29baccbb6b9b07f306633ad5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:54 +0800 Subject: selftests: ublk: handle UBLK_U_IO_FETCH_IO_CMDS Add support for UBLK_U_IO_FETCH_IO_CMDS to enable efficient batch fetching of I/O commands using multishot io_uring operations. Key improvements: - Implement multishot UBLK_U_IO_FETCH_IO_CMDS for continuous command fetching - Add fetch buffer management with page-aligned, mlocked buffers - Process fetched I/O command tags from kernel-provided buffers - Integrate fetch operations with existing batch I/O infrastructure - Significantly reduce uring_cmd issuing overhead through batching The implementation uses two fetch buffers per thread with automatic requeuing to maintain continuous I/O command flow. Each fetch operation retrieves multiple command tags in a single syscall, dramatically improving performance compared to individual command fetching. Technical details: - Fetch buffers are page-aligned and mlocked for optimal performance - Uses IORING_URING_CMD_MULTISHOT for continuous operation - Automatic buffer management and requeuing on completion - Enhanced CQE handling for fetch command completions Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 136 ++++++++++++++++++++++++++++++++++- tools/testing/selftests/ublk/kublk.c | 14 +++- tools/testing/selftests/ublk/kublk.h | 13 ++++ 3 files changed, 159 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 9c4db7335d44..5f9587210b12 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -140,15 +140,63 @@ void ublk_batch_prepare(struct ublk_thread *t) t->nr_bufs); } +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + int ublk_batch_alloc_buf(struct ublk_thread *t) { + int ret; + ublk_assert(t->nr_commit_buf < 16); - return alloc_batch_commit_buf(t); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); } void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); + free_batch_fetch_buf(t); } static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, @@ -199,6 +247,76 @@ static void ublk_setup_commit_sqe(struct ublk_thread *t, cmd->flags |= t->cmd_flags; } +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) + ublk_batch_queue_fetch(t, q, i); +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; @@ -258,6 +376,9 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe) { unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { @@ -265,6 +386,19 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, ublk_batch_compl_commit_cmd(t, cqe, op); return; } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } } void ublk_batch_commit_io_cmds(struct ublk_thread *t) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index bf217d30c15f..c77205bac7a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -519,6 +519,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -878,7 +882,7 @@ static void ublk_handle_cqe(struct ublk_thread *t, unsigned q_id = user_data_to_q_id(cqe->user_data); unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, cqe->res, cqe->user_data, t->state); @@ -1001,9 +1005,13 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf if (!ublk_thread_batch_io(&t)) { /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); - } else if (!t.idx) { + } else { + struct ublk_queue *q = &t.dev->q[t.idx]; + /* prepare all io commands in the 1st thread context */ - ublk_batch_setup_queues(&t); + if (!t.idx) + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t, q); } do { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5b05f6d7d808..950e99c02e8b 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -198,6 +198,13 @@ struct batch_commit_buf { unsigned short count; }; +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -224,6 +231,9 @@ struct ublk_thread { #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; struct batch_commit_buf commit; + /* FETCH_IO_CMDS buffer */ +#define UBLKS_T_NR_FETCH_BUF 2 + struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; struct io_uring ring; }; @@ -515,6 +525,9 @@ static inline unsigned short ublk_batch_io_buf_idx( /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); -- cgit v1.2.3 From 4968fb7cc60676040258c8867f22931c8735126f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:55 +0800 Subject: selftests: ublk: increase timeout to 150 seconds More tests need to be covered in existing generic tests, and default 45sec isn't enough, and timeout is often triggered, increase timeout by adding setting file. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 ++ tools/testing/selftests/ublk/settings | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/ublk/settings (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 3a2498089b15..f2da8b403537 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -52,6 +52,8 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_FILES := settings + TEST_GEN_PROGS_EXTENDED = kublk metadata_size STANDALONE_UTILS := metadata_size.c diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 -- cgit v1.2.3 From 20aeab0b08a175d9ceb4ad327f55ba5c29a79888 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:56 +0800 Subject: selftests: ublk: add --batch/-b for enabling F_BATCH_IO Add --batch/-b for enabling F_BATCH_IO. Add batch_01 for covering its basic function. Add stress_08 and stress_09 for covering stress test. Add recovery test for F_BATCH_IO in generic_04 and generic_05. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 +++ tools/testing/selftests/ublk/kublk.c | 15 +++++++-- tools/testing/selftests/ublk/test_batch_01.sh | 32 ++++++++++++++++++ tools/testing/selftests/ublk/test_generic_04.sh | 5 +++ tools/testing/selftests/ublk/test_generic_05.sh | 5 +++ tools/testing/selftests/ublk/test_stress_08.sh | 45 +++++++++++++++++++++++++ tools/testing/selftests/ublk/test_stress_09.sh | 44 ++++++++++++++++++++++++ 7 files changed, 148 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_01.sh create mode 100755 tools/testing/selftests/ublk/test_stress_08.sh create mode 100755 tools/testing/selftests/ublk/test_stress_09.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index f2da8b403537..520e18e224f2 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -25,6 +25,8 @@ TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_batch_01.sh + TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh @@ -51,6 +53,8 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh TEST_FILES := settings diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index c77205bac7a9..5d84000872a0 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1593,7 +1593,8 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), - FEAT_NAME(UBLK_F_SAFE_STOP_DEV) + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), }; struct ublk_dev *dev; __u64 features = 0; @@ -1691,6 +1692,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1763,6 +1765,7 @@ int main(int argc, char *argv[]) { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1785,12 +1788,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1895,6 +1901,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..9fa9fff5c62f --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_01" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index baf5b156193d..be2292822bbe 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -26,6 +26,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 7b5083afc02a..9b7f71c16d82 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -30,6 +30,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..190db0b4f2ad --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_06" +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..1b6bdb31da03 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_07" +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From e8cd481cc665d5db8e918e84740db22bc213059e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:57 +0800 Subject: selftests: ublk: support arbitrary threads/queues combination Enable flexible thread-to-queue mapping in batch I/O mode to support arbitrary combinations of threads and queues, improving resource utilization and scalability. Key improvements: - Support N:M thread-to-queue mapping (previously limited to 1:1) - Dynamic buffer allocation based on actual queue assignment per thread - Thread-safe queue preparation with spinlock protection - Intelligent buffer index calculation for multi-queue scenarios - Enhanced validation for thread/queue combination constraints Implementation details: - Add q_thread_map matrix to track queue-to-thread assignments - Dynamic allocation of commit and fetch buffers per thread - Round-robin queue assignment algorithm for load balancing - Per-queue spinlock to prevent race conditions during prep - Updated buffer index calculation using queue position within thread This enables efficient configurations like: - Any other N:M combinations for optimal resource matching Testing: - Added test_batch_02.sh: 4 threads vs 1 queue - Added test_batch_03.sh: 1 thread vs 4 queues - Validates correctness across different mapping scenarios Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/batch.c | 199 ++++++++++++++++++++++---- tools/testing/selftests/ublk/kublk.c | 49 +++++-- tools/testing/selftests/ublk/kublk.h | 40 ++++-- tools/testing/selftests/ublk/test_batch_02.sh | 30 ++++ tools/testing/selftests/ublk/test_batch_03.sh | 30 ++++ 6 files changed, 302 insertions(+), 48 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_02.sh create mode 100755 tools/testing/selftests/ublk/test_batch_03.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 520e18e224f2..e39a6f871fcc 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -26,6 +26,8 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 5f9587210b12..a54025b00917 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -76,6 +76,7 @@ static void free_batch_commit_buf(struct ublk_thread *t) free(t->commit_buf); } allocator_deinit(&t->commit_buf_alloc); + free(t->commit); } static int alloc_batch_commit_buf(struct ublk_thread *t) @@ -84,7 +85,13 @@ static int alloc_batch_commit_buf(struct ublk_thread *t) unsigned int total = buf_size * t->nr_commit_buf; unsigned int page_sz = getpagesize(); void *buf = NULL; - int ret; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); @@ -107,6 +114,17 @@ fail: return ret; } +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + void ublk_batch_prepare(struct ublk_thread *t) { /* @@ -119,10 +137,13 @@ void ublk_batch_prepare(struct ublk_thread *t) */ struct ublk_queue *q = &t->dev->q[0]; + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); t->commit_buf_size = ublk_commit_buf_size(t); t->commit_buf_start = t->nr_bufs; - t->nr_commit_buf = 2; + t->nr_commit_buf = 2 * t->nr_queues; t->nr_bufs += t->nr_commit_buf; t->cmd_flags = 0; @@ -144,11 +165,12 @@ static void free_batch_fetch_buf(struct ublk_thread *t) { int i; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + for (i = 0; i < t->nr_fetch_bufs; i++) { io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); free(t->fetch[i].fetch_buf); } + free(t->fetch); } static int alloc_batch_fetch_buf(struct ublk_thread *t) @@ -159,7 +181,12 @@ static int alloc_batch_fetch_buf(struct ublk_thread *t) int ret; int i = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { t->fetch[i].fetch_buf_size = buf_size; if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, @@ -185,7 +212,7 @@ int ublk_batch_alloc_buf(struct ublk_thread *t) { int ret; - ublk_assert(t->nr_commit_buf < 16); + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); ret = alloc_batch_commit_buf(t); if (ret) @@ -271,13 +298,20 @@ static void ublk_batch_queue_fetch(struct ublk_thread *t, t->fetch[buf_idx].fetch_buf_off = 0; } -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q) +void ublk_batch_start_fetch(struct ublk_thread *t) { int i; + int j = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) - ublk_batch_queue_fetch(t, q, i); + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } } static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, @@ -317,7 +351,7 @@ static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, return buf_idx; } -int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; unsigned short buf_idx = ublk_alloc_commit_buf(t); @@ -354,6 +388,22 @@ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) return 0; } +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe, unsigned op) @@ -401,59 +451,89 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, } } -void ublk_batch_commit_io_cmds(struct ublk_thread *t) +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) { struct io_uring_sqe *sqe; unsigned short buf_idx; - unsigned short nr_elem = t->commit.done; + unsigned short nr_elem = cb->done; /* nothing to commit */ if (!nr_elem) { - ublk_free_commit_buf(t, t->commit.buf_idx); + ublk_free_commit_buf(t, cb->buf_idx); return; } ublk_io_alloc_sqes(t, &sqe, 1); - buf_idx = t->commit.buf_idx; - sqe->addr = (__u64)t->commit.elem; + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; sqe->len = nr_elem * t->commit_buf_elem_size; /* commit isn't per-queue command */ - ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, t->commit_buf_elem_size, nr_elem, buf_idx); ublk_setup_commit_sqe(t, sqe, buf_idx); } -static void ublk_batch_init_commit(struct ublk_thread *t, - unsigned short buf_idx) +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) { /* so far only support 1:1 queue/thread mapping */ - t->commit.q_id = t->idx; - t->commit.buf_idx = buf_idx; - t->commit.elem = ublk_get_commit_buf(t, buf_idx); - t->commit.done = 0; - t->commit.count = t->commit_buf_size / + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / t->commit_buf_elem_size; } -void ublk_batch_prep_commit(struct ublk_thread *t) +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) { unsigned short buf_idx = ublk_alloc_commit_buf(t); ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); - ublk_batch_init_commit(t, buf_idx); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; } void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) { - struct batch_commit_buf *cb = &t->commit; - struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + - cb->done * t->commit_buf_elem_size); + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; struct ublk_io *io = &q->ios[tag]; - ublk_assert(q->q_id == t->commit.q_id); + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + + ublk_assert(q->q_id == cb->q_id); + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); elem->tag = tag; elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); elem->result = res; @@ -464,3 +544,64 @@ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, cb->done += 1; ublk_assert(cb->done <= cb->count); } + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 5d84000872a0..2da37557e1a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -455,6 +455,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; @@ -521,7 +522,7 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ if (ublk_dev_batch_io(dev)) - cq_depth += dev->dev_info.queue_depth; + cq_depth += dev->dev_info.queue_depth * 2; ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | @@ -957,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -970,14 +972,18 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) { int i; - /* setup all queues in the 1st thread */ for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { struct ublk_queue *q = &t->dev->q[i]; int ret; + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + ret = ublk_batch_queue_prep_io_cmds(t, q); - ublk_assert(ret == 0); - ret = ublk_process_io(t); ublk_assert(ret >= 0); } } @@ -991,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -1006,12 +1016,8 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); } else { - struct ublk_queue *q = &t.dev->q[t.idx]; - - /* prepare all io commands in the 1st thread context */ - if (!t.idx) - ublk_batch_setup_queues(&t); - ublk_batch_start_fetch(&t, q); + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); } do { @@ -1085,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1104,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1127,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1146,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1314,7 +1333,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1940,6 +1960,13 @@ int main(int argc, char *argv[]) return -EINVAL; } + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 950e99c02e8b..ca97deb5e208 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -173,13 +173,17 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; }; /* align with `ublk_elem_header` */ @@ -206,8 +210,12 @@ struct batch_fetch_buf { }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) @@ -230,10 +238,10 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; - struct batch_commit_buf commit; + struct batch_commit_buf *commit; /* FETCH_IO_CMDS buffer */ -#define UBLKS_T_NR_FETCH_BUF 2 - struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; struct io_uring ring; }; @@ -512,6 +520,21 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + /* * Each IO's buffer index has to be calculated by this helper for * UBLKS_T_BATCH_IO @@ -520,14 +543,13 @@ static inline unsigned short ublk_batch_io_buf_idx( const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { - return tag; + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; } /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q); +void ublk_batch_start_fetch(struct ublk_thread *t); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); @@ -545,6 +567,8 @@ void ublk_batch_commit_io_cmds(struct ublk_thread *t); /* Add a completed I/O operation to the current batch commit buffer */ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..b477f91359e1 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_02" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..13a2b3d3a1b9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_03" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From e4d3fc6a22f53e5bbe51e28b43cb32bc130d9f87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 17:15:44 +0800 Subject: selftests: ublk: fix test name Fix the two added test name. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_stress_08.sh | 2 +- tools/testing/selftests/ublk/test_stress_09.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 190db0b4f2ad..9abb50ee3d00 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" +TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 1b6bdb31da03..87b92b0a2410 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" +TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() -- cgit v1.2.3 From 65c4b312f1f13f4b45e18387f4a8bb19c1ea3ff3 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 21 Jan 2026 09:51:54 -0700 Subject: tools: usb: usbip: remove dead-link from README Remove dead-link to Debug Tips document on usbip project wiki http://usbip.wiki.sourceforge.net/how-to-debug-usbip Tried and failed find the file in archives. It would be nice to locate the file and add this to usbip tool. Reported-by: Ignacio Hernandez-Ros Closes: https://lore.kernel.org/all/0101019bdf6ca137-60344502-51d2-4767-a34b-6a7cf1bfdf4a-000000@us-west-2.amazonses.com/ Signed-off-by: Shuah Khan Link: https://patch.msgid.link/20260121165155.13550-1-skhan@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/README | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/usb/usbip/README b/tools/usb/usbip/README index 2fc021c0eae1..11971538f03e 100644 --- a/tools/usb/usbip/README +++ b/tools/usb/usbip/README @@ -241,8 +241,6 @@ Detach the imported device: [Checklist] - - See 'Debug Tips' on the project wiki. - - http://usbip.wiki.sourceforge.net/how-to-debug-usbip - usbip-host.ko must be bound to the target device. - See /sys/kernel/debug/usb/devices and find "Driver=..." lines of the device. - Target USB gadget must be bound to vudc -- cgit v1.2.3 From e396a74222654486d6ab45dca5d0c54c408b8b91 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Thu, 22 Jan 2026 13:35:50 +0800 Subject: KVM: selftests: Add -U_FORTIFY_SOURCE to avoid some unpredictable test failures Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is automatically enabled at -O1 or above. This results in some fortified version of definitions of standard library functions are included. While linker resolves the symbols, the fortified versions might override the definitions in lib/string_override.c and reference to those PLT entries in GLIBC. This is not a problem for the code in host, but it is a disaster for the guest code. E.g., if build and run x86/nested_emulation_test on Ubuntu 24.04 will encounter a L1 #PF due to memset() reference to __memset_chk@plt. The option -fno-builtin-memset is not helpful here, because those fortified versions are not built-in but some definitions which are included by header, they are for different intentions. In order to eliminate the unpredictable behaviors may vary depending on the linker and platform, add the "-U_FORTIFY_SOURCE" into CFLAGS to prevent from introducing the fortified definitions. Signed-off-by: Zhiquan Li Link: https://patch.msgid.link/20260122053551.548229-1-zhiquan_li@163.com Fixes: 6b6f71484bf4 ("KVM: selftests: Implement memcmp(), memcpy(), and memset() for guest use") Cc: stable@vger.kernel.org [sean: tag for stable] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..d45bf4ccb3bf 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -U_FORTIFY_SOURCE \ -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -fno-strict-aliasing \ -- cgit v1.2.3 From 5094f7d5ff2318edfe6f2a9632b31f0ddefd6ee4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 18 Jan 2026 00:26:21 +0100 Subject: tools/docs: sphinx-build-wrapper: generate rust docs only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the rust docs are generated for each entry in SPHINXDIRS. This is unnecessary as they will be the same for each one. Move the generation, so it is executed only once. Signed-off-by: Thomas Weißschuh Reviewed-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <20260118-docs-spurious-rust-v1-1-998e14b9ed9e@weissschuh.net> --- tools/docs/sphinx-build-wrapper | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 9f1ae1485f84..2f65ddc85955 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -352,23 +352,6 @@ class SphinxBuilder: except (OSError, IOError) as e: print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr) - if self.rustdoc: - print("Building rust docs") - if "MAKE" in self.env: - cmd = [self.env["MAKE"]] - else: - cmd = ["make", "LLVM=1"] - - cmd += [ "rustdoc"] - if self.verbose: - print(" ".join(cmd)) - - try: - subprocess.run(cmd, check=True) - except subprocess.CalledProcessError as e: - print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?", - file=sys.stderr) - def build_pdf_file(self, latex_cmd, from_dir, path): """Builds a single pdf file using latex_cmd""" try: @@ -785,6 +768,23 @@ class SphinxBuilder: elif target == "infodocs": self.handle_info(output_dirs) + if self.rustdoc and target in ["htmldocs", "epubdocs"]: + print("Building rust docs") + if "MAKE" in self.env: + cmd = [self.env["MAKE"]] + else: + cmd = ["make", "LLVM=1"] + + cmd += [ "rustdoc"] + if self.verbose: + print(" ".join(cmd)) + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?", + file=sys.stderr) + def jobs_type(value): """ Handle valid values for -j. Accepts Sphinx "-jauto", plus a number -- cgit v1.2.3 From 2d652135a16b413e38e6d7fed5244690d853756b Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 18 Jan 2026 00:26:22 +0100 Subject: tools/docs: sphinx-build-wrapper: make 'rustdoc' a local variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users of this variable are now in the same method. Demote the instance variable to a local one. Signed-off-by: Thomas Weißschuh Reviewed-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <20260118-docs-spurious-rust-v1-2-998e14b9ed9e@weissschuh.net> --- tools/docs/sphinx-build-wrapper | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 2f65ddc85955..76dfd5cbf178 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -123,12 +123,10 @@ class SphinxBuilder: """ Checks if Rust is enabled """ - self.rustdoc = False - config = os.path.join(self.srctree, ".config") if not os.path.isfile(config): - return + return False re_rust = re.compile(r"CONFIG_RUST=(m|y)") @@ -136,11 +134,13 @@ class SphinxBuilder: with open(config, "r", encoding="utf-8") as fp: for line in fp: if re_rust.match(line): - self.rustdoc = True - return + return True except OSError as e: print(f"Failed to open {config}", file=sys.stderr) + return False + + return False def get_sphinx_extra_opts(self, n_jobs): """ @@ -259,8 +259,6 @@ class SphinxBuilder: self.get_sphinx_extra_opts(n_jobs) - self.check_rust() - # # If venv command line argument is specified, run Sphinx from venv # @@ -680,7 +678,8 @@ class SphinxBuilder: args.extend(["-D", f"latex_elements.papersize={paper}paper"]) - if self.rustdoc: + rustdoc = self.check_rust() + if rustdoc: args.extend(["-t", "rustdoc"]) if not sphinxdirs: @@ -768,7 +767,7 @@ class SphinxBuilder: elif target == "infodocs": self.handle_info(output_dirs) - if self.rustdoc and target in ["htmldocs", "epubdocs"]: + if rustdoc and target in ["htmldocs", "epubdocs"]: print("Building rust docs") if "MAKE" in self.env: cmd = [self.env["MAKE"]] -- cgit v1.2.3 From 6f9a96cc96ea405e0f80fede761dc415e33364c7 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 18 Jan 2026 00:26:23 +0100 Subject: tools/docs: sphinx-build-wrapper: compute sphinxdirs_list earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An upcoming patch will require sphinxdirs_list to be available before the call to check_rust(). Move it up in the function. Signed-off-by: Thomas Weißschuh Reviewed-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <20260118-docs-spurious-rust-v1-3-998e14b9ed9e@weissschuh.net> --- tools/docs/sphinx-build-wrapper | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 76dfd5cbf178..04eb300dab4a 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -670,6 +670,19 @@ class SphinxBuilder: if kerneldoc.startswith(self.srctree): kerneldoc = os.path.relpath(kerneldoc, self.srctree) + if not sphinxdirs: + sphinxdirs = os.environ.get("SPHINXDIRS", ".") + + # + # sphinxdirs can be a list or a whitespace-separated string + # + sphinxdirs_list = [] + for sphinxdir in sphinxdirs: + if isinstance(sphinxdir, list): + sphinxdirs_list += sphinxdir + else: + sphinxdirs_list += sphinxdir.split() + args = [ "-b", builder, "-c", docs_dir ] if builder == "latex": @@ -682,9 +695,6 @@ class SphinxBuilder: if rustdoc: args.extend(["-t", "rustdoc"]) - if not sphinxdirs: - sphinxdirs = os.environ.get("SPHINXDIRS", ".") - # # The sphinx-build tool has a bug: internally, it tries to set # locale with locale.setlocale(locale.LC_ALL, ''). This causes a @@ -695,16 +705,6 @@ class SphinxBuilder: except locale.Error: self.env["LC_ALL"] = "C" - # - # sphinxdirs can be a list or a whitespace-separated string - # - sphinxdirs_list = [] - for sphinxdir in sphinxdirs: - if isinstance(sphinxdir, list): - sphinxdirs_list += sphinxdir - else: - sphinxdirs_list += sphinxdir.split() - # # Step 1: Build each directory in separate. # -- cgit v1.2.3 From ffb569d59c253399efb2345ddfefe7929cd7e2a8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 18 Jan 2026 00:26:24 +0100 Subject: tools/docs: sphinx-build-wrapper: only generate rust docs when requested MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the user explicitly specifies SPHINXDIRS to build a specific subdirectory it is unexpected that the rust docs are also generated. Especially as their generation may dominate the execution time. Only generate the rust docs when they are part of the SPHINXDIRS requested by the user. 'rust/rustdocs' is not considered, as it is not a valid SPHINXDIRS anyways. Signed-off-by: Thomas Weißschuh Reviewed-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <20260118-docs-spurious-rust-v1-4-998e14b9ed9e@weissschuh.net> --- tools/docs/sphinx-build-wrapper | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 04eb300dab4a..78ff7ac202ef 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -119,12 +119,15 @@ class SphinxBuilder: return path - def check_rust(self): + def check_rust(self, sphinxdirs): """ Checks if Rust is enabled """ config = os.path.join(self.srctree, ".config") + if not {'.', 'rust'}.intersection(sphinxdirs): + return False + if not os.path.isfile(config): return False @@ -691,7 +694,7 @@ class SphinxBuilder: args.extend(["-D", f"latex_elements.papersize={paper}paper"]) - rustdoc = self.check_rust() + rustdoc = self.check_rust(sphinxdirs_list) if rustdoc: args.extend(["-t", "rustdoc"]) -- cgit v1.2.3 From 4d7f6319faf209a9472f7bc5df98706b723b9464 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:09 +0100 Subject: docs: kdoc: latex_fonts: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Among the changes, it had to place the xml template inside a code block, as otherwise doc build would break. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <6e0eb2e245eae9b4f39cf231dee32df00b9e8b7b.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/latex_fonts.py | 95 +++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/latex_fonts.py b/tools/lib/python/kdoc/latex_fonts.py index 29317f8006ea..1d04cbda169f 100755 --- a/tools/lib/python/kdoc/latex_fonts.py +++ b/tools/lib/python/kdoc/latex_fonts.py @@ -5,12 +5,13 @@ # Ported to Python by (c) Mauro Carvalho Chehab, 2025 """ -Detect problematic Noto CJK variable fonts. +Detect problematic Noto CJK variable fonts +========================================== -For "make pdfdocs", reports of build errors of translations.pdf started -arriving early 2024 [1, 2]. It turned out that Fedora and openSUSE -tumbleweed have started deploying variable-font [3] format of "Noto CJK" -fonts [4, 5]. For PDF, a LaTeX package named xeCJK is used for CJK +For ``make pdfdocs``, reports of build errors of translations.pdf started +arriving early 2024 [1]_ [2]_. It turned out that Fedora and openSUSE +tumbleweed have started deploying variable-font [3]_ format of "Noto CJK" +fonts [4]_ [5]_. For PDF, a LaTeX package named xeCJK is used for CJK (Chinese, Japanese, Korean) pages. xeCJK requires XeLaTeX/XeTeX, which does not (and likely never will) understand variable fonts for historical reasons. @@ -25,68 +26,77 @@ This script is invoked from the error path of "make pdfdocs" and emits suggestions if variable-font files of "Noto CJK" fonts are in the list of fonts accessible from XeTeX. -References: -[1]: https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/ -[2]: https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/ -[3]: https://en.wikipedia.org/wiki/Variable_font -[4]: https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts -[5]: https://build.opensuse.org/request/show/1157217 +.. [1] https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/ +.. [2] https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/ +.. [3] https://en.wikipedia.org/wiki/Variable_font +.. [4] https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts +.. [5] https://build.opensuse.org/request/show/1157217 -#=========================================================================== Workarounds for building translations.pdf -#=========================================================================== +----------------------------------------- * Denylist "variable font" Noto CJK fonts. + - Create $HOME/deny-vf/fontconfig/fonts.conf from template below, with tweaks if necessary. Remove leading "". + - Path of fontconfig/fonts.conf can be overridden by setting an env variable FONTS_CONF_DENY_VF. - * Template: ------------------------------------------------------------------ - - - - - - - - /usr/share/fonts/google-noto-*-cjk-vf-fonts - - /usr/share/fonts/truetype/Noto*CJK*-VF.otf - - - ------------------------------------------------------------------ + * Template:: + + + + + + + + + /usr/share/fonts/google-noto-*-cjk-vf-fonts + + /usr/share/fonts/truetype/Noto*CJK*-VF.otf + + + The denylisting is activated for "make pdfdocs". * For skipping CJK pages in PDF + - Uninstall texlive-xecjk. Denylisting is not needed in this case. * For printing CJK pages in PDF + - Need non-variable "Noto CJK" fonts. + * Fedora + - google-noto-sans-cjk-fonts - google-noto-serif-cjk-fonts + * openSUSE tumbleweed + - Non-variable "Noto CJK" fonts are not available as distro packages as of April, 2024. Fetch a set of font files from upstream Noto CJK Font released at: + https://github.com/notofonts/noto-cjk/tree/main/Sans#super-otc + and at: + https://github.com/notofonts/noto-cjk/tree/main/Serif#super-otc - , then uncompress and deploy them. + + then uncompress and deploy them. - Remember to update fontconfig cache by running fc-cache. -!!! Caution !!! +.. caution:: Uninstalling "variable font" packages can be dangerous. They might be depended upon by other packages important for your work. Denylisting should be less invasive, as it is effective only while @@ -115,10 +125,15 @@ class LatexFontChecker: self.re_cjk = re.compile(r"([^:]+):\s*Noto\s+(Sans|Sans Mono|Serif) CJK") def description(self): + """ + Returns module description. + """ return __doc__ def get_noto_cjk_vf_fonts(self): - """Get Noto CJK fonts""" + """ + Get Noto CJK fonts. + """ cjk_fonts = set() cmd = ["fc-list", ":", "file", "family", "variable"] @@ -143,7 +158,9 @@ class LatexFontChecker: return sorted(cjk_fonts) def check(self): - """Check for problems with CJK fonts""" + """ + Check for problems with CJK fonts. + """ fonts = textwrap.indent("\n".join(self.get_noto_cjk_vf_fonts()), " ") if not fonts: -- cgit v1.2.3 From 8d08c7c6ffc14abe584738843c8577c691ffcf22 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:10 +0100 Subject: docs: kdoc_files: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <75d58878ad6f83f24f1c0ce9e04301a000ecbaa3.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/kdoc_files.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_files.py b/tools/lib/python/kdoc/kdoc_files.py index bfe02baf1606..022487ea2cc6 100644 --- a/tools/lib/python/kdoc/kdoc_files.py +++ b/tools/lib/python/kdoc/kdoc_files.py @@ -5,7 +5,8 @@ # pylint: disable=R0903,R0913,R0914,R0917 """ -Parse lernel-doc tags on multiple kernel source files. +Classes for navigating through the files that kernel-doc needs to handle +to generate documentation. """ import argparse @@ -43,7 +44,7 @@ class GlobSourceFiles: self.srctree = srctree def _parse_dir(self, dirname): - """Internal function to parse files recursively""" + """Internal function to parse files recursively.""" with os.scandir(dirname) as obj: for entry in obj: @@ -65,7 +66,7 @@ class GlobSourceFiles: def parse_files(self, file_list, file_not_found_cb): """ Define an iterator to parse all source files from file_list, - handling directories if any + handling directories if any. """ if not file_list: @@ -91,18 +92,18 @@ class KernelFiles(): There are two type of parsers defined here: - self.parse_file(): parses both kernel-doc markups and - EXPORT_SYMBOL* macros; - - self.process_export_file(): parses only EXPORT_SYMBOL* macros. + ``EXPORT_SYMBOL*`` macros; + - self.process_export_file(): parses only ``EXPORT_SYMBOL*`` macros. """ def warning(self, msg): - """Ancillary routine to output a warning and increment error count""" + """Ancillary routine to output a warning and increment error count.""" self.config.log.warning(msg) self.errors += 1 def error(self, msg): - """Ancillary routine to output an error and increment error count""" + """Ancillary routine to output an error and increment error count.""" self.config.log.error(msg) self.errors += 1 @@ -128,7 +129,7 @@ class KernelFiles(): def process_export_file(self, fname): """ - Parses EXPORT_SYMBOL* macros from a single Kernel source file. + Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file. """ # Prevent parsing the same file twice if results are cached @@ -157,7 +158,7 @@ class KernelFiles(): wcontents_before_sections=False, logger=None): """ - Initialize startup variables and parse all files + Initialize startup variables and parse all files. """ if not verbose: @@ -213,7 +214,7 @@ class KernelFiles(): def parse(self, file_list, export_file=None): """ - Parse all files + Parse all files. """ glob = GlobSourceFiles(srctree=self.config.src_tree) @@ -242,7 +243,7 @@ class KernelFiles(): filenames=None, export_file=None): """ Interacts over the kernel-doc results and output messages, - returning kernel-doc markups on each interaction + returning kernel-doc markups on each interaction. """ self.out_style.set_config(self.config) -- cgit v1.2.3 From f40bba94a4db923a0ef0355b3055403fc9975729 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:11 +0100 Subject: docs: kdoc_item: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <65a7c6bb318e7a8cbf5c115903d507568099151a.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/kdoc_item.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_item.py b/tools/lib/python/kdoc/kdoc_item.py index 19805301cb2c..2b8a93f79716 100644 --- a/tools/lib/python/kdoc/kdoc_item.py +++ b/tools/lib/python/kdoc/kdoc_item.py @@ -4,7 +4,16 @@ # then pass into the output modules. # +""" +Data class to store a kernel-doc Item. +""" + class KdocItem: + """ + A class that will, eventually, encapsulate all of the parsed data that we + then pass into the output modules. + """ + def __init__(self, name, fname, type, start_line, **other_stuff): self.name = name self.fname = fname @@ -24,6 +33,9 @@ class KdocItem: self.other_stuff = other_stuff def get(self, key, default = None): + """ + Get a value from optional keys. + """ return self.other_stuff.get(key, default) def __getitem__(self, key): @@ -33,10 +45,16 @@ class KdocItem: # Tracking of section and parameter information. # def set_sections(self, sections, start_lines): + """ + Set sections and start lines. + """ self.sections = sections self.section_start_lines = start_lines def set_params(self, names, descs, types, starts): + """ + Set parameter list: names, descriptions, types and start lines. + """ self.parameterlist = names self.parameterdescs = descs self.parametertypes = types -- cgit v1.2.3 From 50206750e08e3087af74aa984d850df978b2554a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:12 +0100 Subject: docs: kdoc_parser: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/kdoc/kdoc_parser.py | 171 +++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 78 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index a9a37519145d..bd88a2cd60c3 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -5,11 +5,8 @@ # pylint: disable=C0301,C0302,R0904,R0912,R0913,R0914,R0915,R0917,R1702 """ -kdoc_parser -=========== - -Read a C language source or header FILE and extract embedded -documentation comments +Classes and functions related to reading a C language source or header FILE +and extract embedded documentation comments from it. """ import sys @@ -195,25 +192,28 @@ function_xforms = [ ] # -# Apply a set of transforms to a block of text. +# Ancillary functions # + def apply_transforms(xforms, text): + """ + Apply a set of transforms to a block of text. + """ for search, subst in xforms: text = search.sub(subst, text) return text -# -# A little helper to get rid of excess white space -# multi_space = KernRe(r'\s\s+') def trim_whitespace(s): + """ + A little helper to get rid of excess white space. + """ return multi_space.sub(' ', s.strip()) -# -# Remove struct/enum members that have been marked "private". -# def trim_private_members(text): - # + """ + Remove ``struct``/``enum`` members that have been marked "private". + """ # First look for a "public:" block that ends a private region, then # handle the "private until the end" case. # @@ -226,20 +226,21 @@ def trim_private_members(text): class state: """ - State machine enums + States used by the parser's state machine. """ # Parser states - NORMAL = 0 # normal code - NAME = 1 # looking for function name - DECLARATION = 2 # We have seen a declaration which might not be done - BODY = 3 # the body of the comment - SPECIAL_SECTION = 4 # doc section ending with a blank line - PROTO = 5 # scanning prototype - DOCBLOCK = 6 # documentation block - INLINE_NAME = 7 # gathering doc outside main block - INLINE_TEXT = 8 # reading the body of inline docs - + NORMAL = 0 #: Normal code. + NAME = 1 #: Looking for function name. + DECLARATION = 2 #: We have seen a declaration which might not be done. + BODY = 3 #: The body of the comment. + SPECIAL_SECTION = 4 #: Doc section ending with a blank line. + PROTO = 5 #: Scanning prototype. + DOCBLOCK = 6 #: Documentation block. + INLINE_NAME = 7 #: Gathering doc outside main block. + INLINE_TEXT = 8 #: Reading the body of inline docs. + + #: Names for each parser state. name = [ "NORMAL", "NAME", @@ -253,9 +254,12 @@ class state: ] -SECTION_DEFAULT = "Description" # default section +SECTION_DEFAULT = "Description" #: Default section. class KernelEntry: + """ + Encapsulates a Kernel documentation entry. + """ def __init__(self, config, fname, ln): self.config = config @@ -288,9 +292,11 @@ class KernelEntry: # Management of section contents # def add_text(self, text): + """Add a new text to the entry contents list.""" self._contents.append(text) def contents(self): + """Returns a string with all content texts that were added.""" return '\n'.join(self._contents) + '\n' # TODO: rename to emit_message after removal of kernel-doc.pl @@ -309,10 +315,10 @@ class KernelEntry: self.warnings.append(log_msg) return - # - # Begin a new section. - # def begin_section(self, line_no, title = SECTION_DEFAULT, dump = False): + """ + Begin a new section. + """ if dump: self.dump_section(start_new = True) self.section = title @@ -366,11 +372,13 @@ class KernelDoc: documentation comments. """ - # Section names - + #: Name of context section. section_context = "Context" + + #: Name of return section. section_return = "Return" + #: String to write when a parameter is not described. undescribed = "-- undescribed --" def __init__(self, config, fname): @@ -416,7 +424,7 @@ class KernelDoc: def dump_section(self, start_new=True): """ - Dumps section contents to arrays/hashes intended for that purpose. + Dump section contents to arrays/hashes intended for that purpose. """ if self.entry: @@ -425,9 +433,9 @@ class KernelDoc: # TODO: rename it to store_declaration after removal of kernel-doc.pl def output_declaration(self, dtype, name, **args): """ - Stores the entry into an entry array. + Store the entry into an entry array. - The actual output and output filters will be handled elsewhere + The actual output and output filters will be handled elsewhere. """ item = KdocItem(name, self.fname, dtype, @@ -663,10 +671,12 @@ class KernelDoc: self.emit_msg(ln, f"No description found for return value of '{declaration_name}'") - # - # Split apart a structure prototype; returns (struct|union, name, members) or None - # def split_struct_proto(self, proto): + """ + Split apart a structure prototype; returns (struct|union, name, + members) or ``None``. + """ + type_pattern = r'(struct|union)' qualifiers = [ "__attribute__", @@ -685,21 +695,26 @@ class KernelDoc: if r.search(proto): return (r.group(1), r.group(3), r.group(2)) return None - # - # Rewrite the members of a structure or union for easier formatting later on. - # Among other things, this function will turn a member like: - # - # struct { inner_members; } foo; - # - # into: - # - # struct foo; inner_members; - # + def rewrite_struct_members(self, members): + """ + Process ``struct``/``union`` members from the most deeply nested + outward. + + Rewrite the members of a ``struct`` or ``union`` for easier formatting + later on. Among other things, this function will turn a member like:: + + struct { inner_members; } foo; + + into:: + + struct foo; inner_members; + """ + # - # Process struct/union members from the most deeply nested outward. The - # trick is in the ^{ below - it prevents a match of an outer struct/union - # until the inner one has been munged (removing the "{" in the process). + # The trick is in the ``^{`` below - it prevents a match of an outer + # ``struct``/``union`` until the inner one has been munged + # (removing the ``{`` in the process). # struct_members = KernRe(r'(struct|union)' # 0: declaration type r'([^\{\};]+)' # 1: possible name @@ -777,11 +792,12 @@ class KernelDoc: tuples = struct_members.findall(members) return members - # - # Format the struct declaration into a standard form for inclusion in the - # resulting docs. - # def format_struct_decl(self, declaration): + """ + Format the ``struct`` declaration into a standard form for inclusion + in the resulting docs. + """ + # # Insert newlines, get rid of extra spaces. # @@ -815,7 +831,7 @@ class KernelDoc: def dump_struct(self, ln, proto): """ - Store an entry for a struct or union + Store an entry for a ``struct`` or ``union`` """ # # Do the basic parse to get the pieces of the declaration. @@ -857,7 +873,7 @@ class KernelDoc: def dump_enum(self, ln, proto): """ - Stores an enum inside self.entries array. + Store an ``enum`` inside self.entries array. """ # # Strip preprocessor directives. Note that this depends on the @@ -1004,7 +1020,7 @@ class KernelDoc: def dump_declaration(self, ln, prototype): """ - Stores a data declaration inside self.entries array. + Store a data declaration inside self.entries array. """ if self.entry.decl_type == "enum": @@ -1021,7 +1037,7 @@ class KernelDoc: def dump_function(self, ln, prototype): """ - Stores a function or function macro inside self.entries array. + Store a function or function macro inside self.entries array. """ found = func_macro = False @@ -1122,7 +1138,7 @@ class KernelDoc: def dump_typedef(self, ln, proto): """ - Stores a typedef inside self.entries array. + Store a ``typedef`` inside self.entries array. """ # # We start by looking for function typedefs. @@ -1176,7 +1192,7 @@ class KernelDoc: @staticmethod def process_export(function_set, line): """ - process EXPORT_SYMBOL* tags + process ``EXPORT_SYMBOL*`` tags This method doesn't use any variable from the class, so declare it with a staticmethod decorator. @@ -1207,7 +1223,7 @@ class KernelDoc: def process_normal(self, ln, line): """ - STATE_NORMAL: looking for the /** to begin everything. + STATE_NORMAL: looking for the ``/**`` to begin everything. """ if not doc_start.match(line): @@ -1297,10 +1313,10 @@ class KernelDoc: else: self.emit_msg(ln, f"Cannot find identifier on line:\n{line}") - # - # Helper function to determine if a new section is being started. - # def is_new_section(self, ln, line): + """ + Helper function to determine if a new section is being started. + """ if doc_sect.search(line): self.state = state.BODY # @@ -1332,10 +1348,10 @@ class KernelDoc: return True return False - # - # Helper function to detect (and effect) the end of a kerneldoc comment. - # def is_comment_end(self, ln, line): + """ + Helper function to detect (and effect) the end of a kerneldoc comment. + """ if doc_end.search(line): self.dump_section() @@ -1354,7 +1370,7 @@ class KernelDoc: def process_decl(self, ln, line): """ - STATE_DECLARATION: We've seen the beginning of a declaration + STATE_DECLARATION: We've seen the beginning of a declaration. """ if self.is_new_section(ln, line) or self.is_comment_end(ln, line): return @@ -1383,7 +1399,7 @@ class KernelDoc: def process_special(self, ln, line): """ - STATE_SPECIAL_SECTION: a section ending with a blank line + STATE_SPECIAL_SECTION: a section ending with a blank line. """ # # If we have hit a blank line (only the " * " marker), then this @@ -1473,7 +1489,7 @@ class KernelDoc: def syscall_munge(self, ln, proto): # pylint: disable=W0613 """ - Handle syscall definitions + Handle syscall definitions. """ is_void = False @@ -1512,7 +1528,7 @@ class KernelDoc: def tracepoint_munge(self, ln, proto): """ - Handle tracepoint definitions + Handle tracepoint definitions. """ tracepointname = None @@ -1548,7 +1564,7 @@ class KernelDoc: return proto def process_proto_function(self, ln, line): - """Ancillary routine to process a function prototype""" + """Ancillary routine to process a function prototype.""" # strip C99-style comments to end of line line = KernRe(r"//.*$", re.S).sub('', line) @@ -1593,7 +1609,9 @@ class KernelDoc: self.reset_state(ln) def process_proto_type(self, ln, line): - """Ancillary routine to process a type""" + """ + Ancillary routine to process a type. + """ # Strip C99-style comments and surrounding whitespace line = KernRe(r"//.*$", re.S).sub('', line).strip() @@ -1647,7 +1665,7 @@ class KernelDoc: self.process_proto_type(ln, line) def process_docblock(self, ln, line): - """STATE_DOCBLOCK: within a DOC: block.""" + """STATE_DOCBLOCK: within a ``DOC:`` block.""" if doc_end.search(line): self.dump_section() @@ -1659,7 +1677,7 @@ class KernelDoc: def parse_export(self): """ - Parses EXPORT_SYMBOL* macros from a single Kernel source file. + Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file. """ export_table = set() @@ -1676,10 +1694,7 @@ class KernelDoc: return export_table - # - # The state/action table telling us which function to invoke in - # each state. - # + #: The state/action table telling us which function to invoke in each state. state_actions = { state.NORMAL: process_normal, state.NAME: process_name, -- cgit v1.2.3 From 245f1ab2c9bce18a9467b4ef892570dd83b049d2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:13 +0100 Subject: docs: kdoc_output: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/kdoc/kdoc_output.py | 60 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py index d2bf94275d65..4210b91dde5f 100644 --- a/tools/lib/python/kdoc/kdoc_output.py +++ b/tools/lib/python/kdoc/kdoc_output.py @@ -5,14 +5,16 @@ # pylint: disable=C0301,R0902,R0911,R0912,R0913,R0914,R0915,R0917 """ -Implement output filters to print kernel-doc documentation. +Classes to implement output filters to print kernel-doc documentation. -The implementation uses a virtual base class (OutputFormat) which +The implementation uses a virtual base class ``OutputFormat``. It contains dispatches to virtual methods, and some code to filter out output messages. The actual implementation is done on one separate class per each type -of output. Currently, there are output classes for ReST and man/troff. +of output, e.g. ``RestFormat`` and ``ManFormat`` classes. + +Currently, there are output classes for ReST and man/troff. """ import os @@ -54,16 +56,19 @@ class OutputFormat: """ # output mode. - OUTPUT_ALL = 0 # output all symbols and doc sections - OUTPUT_INCLUDE = 1 # output only specified symbols - OUTPUT_EXPORTED = 2 # output exported symbols - OUTPUT_INTERNAL = 3 # output non-exported symbols + OUTPUT_ALL = 0 #: Output all symbols and doc sections. + OUTPUT_INCLUDE = 1 #: Output only specified symbols. + OUTPUT_EXPORTED = 2 #: Output exported symbols. + OUTPUT_INTERNAL = 3 #: Output non-exported symbols. - # Virtual member to be overridden at the inherited classes + #: Highlights to be used in ReST format. highlights = [] + #: Blank line character. + blankline = "" + def __init__(self): - """Declare internal vars and set mode to OUTPUT_ALL""" + """Declare internal vars and set mode to ``OUTPUT_ALL``.""" self.out_mode = self.OUTPUT_ALL self.enable_lineno = None @@ -128,7 +133,7 @@ class OutputFormat: self.config.warning(log_msg) def check_doc(self, name, args): - """Check if DOC should be output""" + """Check if DOC should be output.""" if self.no_doc_sections: return False @@ -177,7 +182,7 @@ class OutputFormat: def msg(self, fname, name, args): """ - Handles a single entry from kernel-doc parser + Handles a single entry from kernel-doc parser. """ self.data = "" @@ -220,30 +225,31 @@ class OutputFormat: # Virtual methods to be overridden by inherited classes # At the base class, those do nothing. def set_symbols(self, symbols): - """Get a list of all symbols from kernel_doc""" + """Get a list of all symbols from kernel_doc.""" def out_doc(self, fname, name, args): - """Outputs a DOC block""" + """Outputs a DOC block.""" def out_function(self, fname, name, args): - """Outputs a function""" + """Outputs a function.""" def out_enum(self, fname, name, args): - """Outputs an enum""" + """Outputs an enum.""" def out_var(self, fname, name, args): - """Outputs a variable""" + """Outputs a variable.""" def out_typedef(self, fname, name, args): - """Outputs a typedef""" + """Outputs a typedef.""" def out_struct(self, fname, name, args): - """Outputs a struct""" + """Outputs a struct.""" class RestFormat(OutputFormat): - """Consts and functions used by ReST output""" + """Consts and functions used by ReST output.""" + #: Highlights to be used in ReST format highlights = [ (type_constant, r"``\1``"), (type_constant2, r"``\1``"), @@ -263,9 +269,13 @@ class RestFormat(OutputFormat): (type_fallback, r":c:type:`\1`"), (type_param_ref, r"**\1\2**") ] + blankline = "\n" + #: Sphinx literal block regex. sphinx_literal = KernRe(r'^[^.].*::$', cache=False) + + #: Sphinx code block regex. sphinx_cblock = KernRe(r'^\.\.\ +code-block::', cache=False) def __init__(self): @@ -280,7 +290,7 @@ class RestFormat(OutputFormat): self.lineprefix = "" def print_lineno(self, ln): - """Outputs a line number""" + """Outputs a line number.""" if self.enable_lineno and ln is not None: ln += 1 @@ -289,7 +299,7 @@ class RestFormat(OutputFormat): def output_highlight(self, args): """ Outputs a C symbol that may require being converted to ReST using - the self.highlights variable + the self.highlights variable. """ input_text = args @@ -570,7 +580,7 @@ class RestFormat(OutputFormat): class ManFormat(OutputFormat): - """Consts and functions used by man pages output""" + """Consts and functions used by man pages output.""" highlights = ( (type_constant, r"\1"), @@ -587,6 +597,7 @@ class ManFormat(OutputFormat): ) blankline = "" + #: Allowed timestamp formats. date_formats = [ "%a %b %d %H:%M:%S %Z %Y", "%a %b %d %H:%M:%S %Y", @@ -653,7 +664,7 @@ class ManFormat(OutputFormat): self.symbols = symbols def out_tail(self, fname, name, args): - """Adds a tail for all man pages""" + """Adds a tail for all man pages.""" # SEE ALSO section self.data += f'.SH "SEE ALSO"' + "\n.PP\n" @@ -689,7 +700,7 @@ class ManFormat(OutputFormat): def output_highlight(self, block): """ Outputs a C symbol that may require being highlighted with - self.highlights variable using troff syntax + self.highlights variable using troff syntax. """ contents = self.highlight_block(block) @@ -720,7 +731,6 @@ class ManFormat(OutputFormat): self.output_highlight(text) def out_function(self, fname, name, args): - """output function in man""" out_name = self.arg_name(args, name) -- cgit v1.2.3 From b0b88915c83c2888e60a27c4914d10486f34fe3a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:14 +0100 Subject: docs: kdoc_re: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <14a12a43144d52345bfd405d0401d246f0885acf.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/kdoc_re.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py index 2dfa1bf83d64..2816bd9f90f8 100644 --- a/tools/lib/python/kdoc/kdoc_re.py +++ b/tools/lib/python/kdoc/kdoc_re.py @@ -51,6 +51,9 @@ class KernRe: """ return self.regex.pattern + def __repr__(self): + return f're.compile("{self.regex.pattern}")' + def __add__(self, other): """ Allows adding two regular expressions into one. @@ -61,7 +64,7 @@ class KernRe: def match(self, string): """ - Handles a re.match storing its results + Handles a re.match storing its results. """ self.last_match = self.regex.match(string) @@ -69,7 +72,7 @@ class KernRe: def search(self, string): """ - Handles a re.search storing its results + Handles a re.search storing its results. """ self.last_match = self.regex.search(string) @@ -77,28 +80,28 @@ class KernRe: def findall(self, string): """ - Alias to re.findall + Alias to re.findall. """ return self.regex.findall(string) def split(self, string): """ - Alias to re.split + Alias to re.split. """ return self.regex.split(string) def sub(self, sub, string, count=0): """ - Alias to re.sub + Alias to re.sub. """ return self.regex.sub(sub, string, count=count) def group(self, num): """ - Returns the group results of the last match + Returns the group results of the last match. """ return self.last_match.group(num) @@ -110,7 +113,7 @@ class NestedMatch: even harder on Python with its normal re module, as there are several advanced regular expressions that are missing. - This is the case of this pattern: + This is the case of this pattern:: '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' @@ -121,6 +124,7 @@ class NestedMatch: replace nested expressions. The original approach was suggested by: + https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex Although I re-implemented it to make it more generic and match 3 types -- cgit v1.2.3 From e68c84b9f3ba138878581a9f36a02c67d2ae20d4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:15 +0100 Subject: docs: kdoc: parse_data_structs: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <76ead85b4c13a8038180a792e270c3691d26cd25.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/parse_data_structs.py | 62 ++++++++++++++++++----------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/parse_data_structs.py b/tools/lib/python/kdoc/parse_data_structs.py index 25361996cd20..9941cd19032e 100755 --- a/tools/lib/python/kdoc/parse_data_structs.py +++ b/tools/lib/python/kdoc/parse_data_structs.py @@ -9,12 +9,12 @@ Parse a source file or header, creating ReStructured Text cross references. It accepts an optional file to change the default symbol reference or to suppress symbols from the output. -It is capable of identifying defines, functions, structs, typedefs, -enums and enum symbols and create cross-references for all of them. +It is capable of identifying ``define``, function, ``struct``, ``typedef``, +``enum`` and ``enum`` symbols and create cross-references for all of them. It is also capable of distinguish #define used for specifying a Linux ioctl. -The optional rules file contains a set of rules like: +The optional rules file contains a set of rules like:: ignore ioctl VIDIOC_ENUM_FMT replace ioctl VIDIOC_DQBUF vidioc_qbuf @@ -34,8 +34,8 @@ class ParseDataStructs: It is meant to allow having a more comprehensive documentation, where uAPI headers will create cross-reference links to the code. - It is capable of identifying defines, functions, structs, typedefs, - enums and enum symbols and create cross-references for all of them. + It is capable of identifying ``define``, function, ``struct``, ``typedef``, + ``enum`` and ``enum`` symbols and create cross-references for all of them. It is also capable of distinguish #define used for specifying a Linux ioctl. @@ -43,13 +43,13 @@ class ParseDataStructs: allows parsing an exception file. Such file contains a set of rules using the syntax below: - 1. Ignore rules: + 1. Ignore rules:: ignore ` Removes the symbol from reference generation. - 2. Replace rules: + 2. Replace rules:: replace @@ -58,22 +58,22 @@ class ParseDataStructs: - A simple symbol name; - A full Sphinx reference. - 3. Namespace rules + 3. Namespace rules:: namespace Sets C namespace to be used during cross-reference generation. Can be overridden by replace rules. - On ignore and replace rules, can be: - - ioctl: for defines that end with _IO*, e.g. ioctl definitions - - define: for other defines - - symbol: for symbols defined within enums; - - typedef: for typedefs; - - enum: for the name of a non-anonymous enum; - - struct: for structs. + On ignore and replace rules, ```` can be: + - ``ioctl``: for defines that end with ``_IO*``, e.g. ioctl definitions + - ``define``: for other defines + - ``symbol``: for symbols defined within enums; + - ``typedef``: for typedefs; + - ``enum``: for the name of a non-anonymous enum; + - ``struct``: for structs. - Examples: + Examples:: ignore define __LINUX_MEDIA_H ignore ioctl VIDIOC_ENUM_FMT @@ -83,13 +83,15 @@ class ParseDataStructs: namespace MC """ - # Parser regexes with multiple ways to capture enums and structs + #: Parser regex with multiple ways to capture enums. RE_ENUMS = [ re.compile(r"^\s*enum\s+([\w_]+)\s*\{"), re.compile(r"^\s*enum\s+([\w_]+)\s*$"), re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"), re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"), ] + + #: Parser regex with multiple ways to capture structs. RE_STRUCTS = [ re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"), re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"), @@ -97,11 +99,13 @@ class ParseDataStructs: re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"), ] - # FIXME: the original code was written a long time before Sphinx C + # NOTE: the original code was written a long time before Sphinx C # domain to have multiple namespaces. To avoid to much turn at the # existing hyperlinks, the code kept using "c:type" instead of the # right types. To change that, we need to change the types not only # here, but also at the uAPI media documentation. + + #: Dictionary containing C type identifiers to be transformed. DEF_SYMBOL_TYPES = { "ioctl": { "prefix": "\\ ", @@ -158,6 +162,10 @@ class ParseDataStructs: self.symbols[symbol_type] = {} def read_exceptions(self, fname: str): + """ + Read an optional exceptions file, used to override defaults. + """ + if not fname: return @@ -242,9 +250,9 @@ class ParseDataStructs: def store_type(self, ln, symbol_type: str, symbol: str, ref_name: str = None, replace_underscores: bool = True): """ - Stores a new symbol at self.symbols under symbol_type. + Store a new symbol at self.symbols under symbol_type. - By default, underscores are replaced by "-" + By default, underscores are replaced by ``-``. """ defs = self.DEF_SYMBOL_TYPES[symbol_type] @@ -276,12 +284,16 @@ class ParseDataStructs: self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln) def store_line(self, line): - """Stores a line at self.data, properly indented""" + """ + Store a line at self.data, properly indented. + """ line = " " + line.expandtabs() self.data += line.rstrip(" ") def parse_file(self, file_in: str, exceptions: str = None): - """Reads a C source file and get identifiers""" + """ + Read a C source file and get identifiers. + """ self.data = "" is_enum = False is_comment = False @@ -433,7 +445,7 @@ class ParseDataStructs: def gen_toc(self): """ - Create a list of symbols to be part of a TOC contents table + Create a list of symbols to be part of a TOC contents table. """ text = [] @@ -464,6 +476,10 @@ class ParseDataStructs: return "\n".join(text) def write_output(self, file_in: str, file_out: str, toc: bool): + """ + Write a ReST output file. + """ + title = os.path.basename(file_in) if toc: -- cgit v1.2.3 From 7ef684c9fdb336b1e102c014da2424c0240196c2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:16 +0100 Subject: docs: kdoc: enrich_formatter: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <55ec8b896fe00529d326859cd094230fb5a2cd30.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/enrich_formatter.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/enrich_formatter.py b/tools/lib/python/kdoc/enrich_formatter.py index bb171567a4ca..d1be4e5e1962 100644 --- a/tools/lib/python/kdoc/enrich_formatter.py +++ b/tools/lib/python/kdoc/enrich_formatter.py @@ -26,12 +26,16 @@ class EnrichFormatter(argparse.HelpFormatter): and how they're used at the __doc__ description. """ def __init__(self, *args, **kwargs): - """Initialize class and check if is TTY""" + """ + Initialize class and check if is TTY. + """ super().__init__(*args, **kwargs) self._tty = sys.stdout.isatty() def enrich_text(self, text): - """Handle ReST markups (currently, only ``foo``)""" + r""" + Handle ReST markups (currently, only \`\`text\`\` markups). + """ if self._tty and text: # Replace ``text`` with ANSI SGR (bold) return re.sub(r'\`\`(.+?)\`\`', @@ -39,12 +43,16 @@ class EnrichFormatter(argparse.HelpFormatter): return text def _fill_text(self, text, width, indent): - """Enrich descriptions with markups on it""" + """ + Enrich descriptions with markups on it. + """ enriched = self.enrich_text(text) return "\n".join(indent + line for line in enriched.splitlines()) def _format_usage(self, usage, actions, groups, prefix): - """Enrich positional arguments at usage: line""" + """ + Enrich positional arguments at usage: line. + """ prog = self._prog parts = [] @@ -63,7 +71,9 @@ class EnrichFormatter(argparse.HelpFormatter): return usage_text def _format_action_invocation(self, action): - """Enrich argument names""" + """ + Enrich argument names. + """ if not action.option_strings: return self.enrich_text(f"``{action.dest.upper()}``") -- cgit v1.2.3 From 33220c1fc10b2e53f0a79cdf6447fd7bf405a860 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:17 +0100 Subject: docs: kdoc: python_version: Improve docstrings and comments In preparation to document kernel-doc module, improve its documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <2153afaeb496e1bb8d3cc318fff26c3f99d99486.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/kdoc/python_version.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/kdoc/python_version.py b/tools/lib/python/kdoc/python_version.py index e83088013db2..4ddb7ead5f56 100644 --- a/tools/lib/python/kdoc/python_version.py +++ b/tools/lib/python/kdoc/python_version.py @@ -33,21 +33,31 @@ class PythonVersion: """ def __init__(self, version): - """Ïnitialize self.version tuple from a version string""" + """ + Ïnitialize self.version tuple from a version string. + """ self.version = self.parse_version(version) @staticmethod def parse_version(version): - """Convert a major.minor.patch version into a tuple""" + """ + Convert a major.minor.patch version into a tuple. + """ return tuple(int(x) for x in version.split(".")) @staticmethod def ver_str(version): - """Returns a version tuple as major.minor.patch""" + """ + Returns a version tuple as major.minor.patch. + """ return ".".join([str(x) for x in version]) @staticmethod def cmd_print(cmd, max_len=80): + """ + Outputs a command line, repecting maximum width. + """ + cmd_line = [] for w in cmd: @@ -66,7 +76,9 @@ class PythonVersion: return "\n ".join(cmd_line) def __str__(self): - """Returns a version tuple as major.minor.patch from self.version""" + """ + Return a version tuple as major.minor.patch from self.version. + """ return self.ver_str(self.version) @staticmethod -- cgit v1.2.3 From 66c3bf974d48f8e5c5f94148e1171b62bd80e26d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:20 +0100 Subject: docs: python: abi_parser: do some improvements at documentation Add documentation for two consts and ensure that all sentenses will end with a dot. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/abi/abi_parser.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/abi/abi_parser.py b/tools/lib/python/abi/abi_parser.py index 9b8db70067ef..d7bb20ef3acc 100644 --- a/tools/lib/python/abi/abi_parser.py +++ b/tools/lib/python/abi/abi_parser.py @@ -21,14 +21,17 @@ from abi.helpers import AbiDebug, ABI_DIR class AbiParser: - """Main class to parse ABI files""" + """Main class to parse ABI files.""" + #: Valid tags at Documentation/ABI. TAGS = r"(what|where|date|kernelversion|contact|description|users)" + + #: ABI elements that will auto-generate cross-references. XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)" def __init__(self, directory, logger=None, enable_lineno=False, show_warnings=True, debug=0): - """Stores arguments for the class and initialize class vars""" + """Stores arguments for the class and initialize class vars.""" self.directory = directory self.enable_lineno = enable_lineno @@ -65,7 +68,7 @@ class AbiParser: self.re_xref_node = re.compile(self.XREF) def warn(self, fdata, msg, extra=None): - """Displays a parse error if warning is enabled""" + """Displays a parse error if warning is enabled.""" if not self.show_warnings: return @@ -77,7 +80,7 @@ class AbiParser: self.log.warning(msg) def add_symbol(self, what, fname, ln=None, xref=None): - """Create a reference table describing where each 'what' is located""" + """Create a reference table describing where each 'what' is located.""" if what not in self.what_symbols: self.what_symbols[what] = {"file": {}} @@ -92,7 +95,7 @@ class AbiParser: self.what_symbols[what]["xref"] = xref def _parse_line(self, fdata, line): - """Parse a single line of an ABI file""" + """Parse a single line of an ABI file.""" new_what = False new_tag = False @@ -264,7 +267,7 @@ class AbiParser: self.warn(fdata, "Unexpected content", line) def parse_readme(self, nametag, fname): - """Parse ABI README file""" + """Parse ABI README file.""" nametag["what"] = ["Introduction"] nametag["path"] = "README" @@ -282,7 +285,7 @@ class AbiParser: nametag["description"] += line def parse_file(self, fname, path, basename): - """Parse a single file""" + """Parse a single file.""" ref = f"abi_file_{path}_{basename}" ref = self.re_unprintable.sub("_", ref).strip("_") @@ -348,7 +351,7 @@ class AbiParser: self.add_symbol(what=w, fname=fname, xref=fdata.key) def _parse_abi(self, root=None): - """Internal function to parse documentation ABI recursively""" + """Internal function to parse documentation ABI recursively.""" if not root: root = self.directory @@ -377,7 +380,7 @@ class AbiParser: self.parse_file(name, path, basename) def parse_abi(self, root=None): - """Parse documentation ABI""" + """Parse documentation ABI.""" self._parse_abi(root) @@ -385,7 +388,7 @@ class AbiParser: self.log.debug(pformat(self.data)) def desc_txt(self, desc): - """Print description as found inside ABI files""" + """Print description as found inside ABI files.""" desc = desc.strip(" \t\n") @@ -393,7 +396,7 @@ class AbiParser: def xref(self, fname): """ - Converts a Documentation/ABI + basename into a ReST cross-reference + Converts a Documentation/ABI + basename into a ReST cross-reference. """ xref = self.file_refs.get(fname) @@ -403,7 +406,7 @@ class AbiParser: return xref def desc_rst(self, desc): - """Enrich ReST output by creating cross-references""" + """Enrich ReST output by creating cross-references.""" # Remove title markups from the description # Having titles inside ABI files will only work if extra @@ -459,7 +462,7 @@ class AbiParser: def doc(self, output_in_txt=False, show_symbols=True, show_file=True, filter_path=None): - """Print ABI at stdout""" + """Print ABI at stdout.""" part = None for key, v in sorted(self.data.items(), @@ -549,7 +552,7 @@ class AbiParser: yield (msg, file_ref[0][0], ln) def check_issues(self): - """Warn about duplicated ABI entries""" + """Warn about duplicated ABI entries.""" for what, v in self.what_symbols.items(): files = v.get("file") @@ -575,7 +578,7 @@ class AbiParser: self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) def search_symbols(self, expr): - """ Searches for ABI symbols """ + """ Searches for ABI symbols.""" regex = re.compile(expr, re.I) -- cgit v1.2.3 From ff91637dece7f4e108f7a2e76bd7e1054d24f600 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:21 +0100 Subject: docs: python: abi_regex: do some improvements at documentation Add documentation for two consts and ensure that all sentenses will end with a dot. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <5419ad89a5042c1571198c2f055866674808579b.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/abi/abi_regex.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/abi/abi_regex.py b/tools/lib/python/abi/abi_regex.py index d5553206de3c..d0c5e3ede6b5 100644 --- a/tools/lib/python/abi/abi_regex.py +++ b/tools/lib/python/abi/abi_regex.py @@ -16,10 +16,22 @@ from abi.abi_parser import AbiParser from abi.helpers import AbiDebug class AbiRegex(AbiParser): - """Extends AbiParser to search ABI nodes with regular expressions""" + """ + Extends AbiParser to search ABI nodes with regular expressions. - # Escape only ASCII visible characters + There some optimizations here to allow a quick symbol search: + instead of trying to place all symbols altogether an doing linear + search which is very time consuming, create a tree with one depth, + grouping similar symbols altogether. + + Yet, sometimes a full search will be needed, so we have a special branch + on such group tree where other symbols are placed. + """ + + #: Escape only ASCII visible characters. escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])" + + #: Special group for other nodes. leave_others = "others" # Tuples with regular expressions to be compiled and replacement data @@ -88,13 +100,15 @@ class AbiRegex(AbiParser): # Recover plus characters (re.compile(r"\xf7"), "+"), ] + + #: Regex to check if the symbol name has a number on it. re_has_num = re.compile(r"\\d") - # Symbol name after escape_chars that are considered a devnode basename + #: Symbol name after escape_chars that are considered a devnode basename. re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$") - # List of popular group names to be skipped to minimize regex group size - # Use AbiDebug.SUBGROUP_SIZE to detect those + #: List of popular group names to be skipped to minimize regex group size + #: Use AbiDebug.SUBGROUP_SIZE to detect those. skip_names = set(["devices", "hwmon"]) def regex_append(self, what, new): @@ -148,7 +162,7 @@ class AbiRegex(AbiParser): def get_regexes(self, what): """ Given an ABI devnode, return a list of all regular expressions that - may match it, based on the sub-groups created by regex_append() + may match it, based on the sub-groups created by regex_append(). """ re_list = [] -- cgit v1.2.3 From a50c62d375a824046a7baa9cb03e5a7e8bf7c6c4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:22 +0100 Subject: docs: kabi: system_symbols: end docstring phrases with a dot Some docstring classes are not ending with a dot. Fix to make it more uniform. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: --- tools/lib/python/abi/system_symbols.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/abi/system_symbols.py b/tools/lib/python/abi/system_symbols.py index 4a2554da217b..7bbefd274ea2 100644 --- a/tools/lib/python/abi/system_symbols.py +++ b/tools/lib/python/abi/system_symbols.py @@ -18,11 +18,11 @@ from random import shuffle from abi.helpers import AbiDebug class SystemSymbols: - """Stores arguments for the class and initialize class vars""" + """Stores arguments for the class and initialize class vars.""" def graph_add_file(self, path, link=None): """ - add a file path to the sysfs graph stored at self.root + add a file path to the sysfs graph stored at self.root. """ if path in self.files: @@ -43,7 +43,7 @@ class SystemSymbols: self.files.add(path) def print_graph(self, root_prefix="", root=None, level=0): - """Prints a reference tree graph using UTF-8 characters""" + """Prints a reference tree graph using UTF-8 characters.""" if not root: root = self.root @@ -173,7 +173,7 @@ class SystemSymbols: self._walk(sysfs) def check_file(self, refs, found): - """Check missing ABI symbols for a given sysfs file""" + """Check missing ABI symbols for a given sysfs file.""" res_list = [] @@ -214,7 +214,7 @@ class SystemSymbols: return res_list def _ref_interactor(self, root): - """Recursive function to interact over the sysfs tree""" + """Recursive function to interact over the sysfs tree.""" for k, v in root.items(): if isinstance(v, dict): @@ -232,7 +232,7 @@ class SystemSymbols: def get_fileref(self, all_refs, chunk_size): - """Interactor to group refs into chunks""" + """Interactor to group refs into chunks.""" n = 0 refs = [] @@ -250,7 +250,7 @@ class SystemSymbols: def check_undefined_symbols(self, max_workers=None, chunk_size=50, found=None, dry_run=None): - """Seach ABI for sysfs symbols missing documentation""" + """Seach ABI for sysfs symbols missing documentation.""" self.abi.parse_abi() -- cgit v1.2.3 From 5c9ece0b02b219e8502f66b8d9636d511280126d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:23 +0100 Subject: docs: kabi: helpers: add helper for debug bits 7 and 8 The kabi logic supports 8 debug bits, but only 6 are currently documented. Document the remaining ones. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <60e99b9060396eac8621954d6b8a73af45df90fb.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/abi/helpers.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py index 639b23e4ca33..b8c8dfb1272e 100644 --- a/tools/lib/python/abi/helpers.py +++ b/tools/lib/python/abi/helpers.py @@ -35,4 +35,6 @@ DEBUG_HELP = """ 16 - enable debug for what to regex conversion 32 - enable debug for symbol regex subgroups 64 - enable debug for sysfs graph tree variable +128 - enable debug of search groups +256 - enable displaying refrence tree graphs for undefined symbols. """ -- cgit v1.2.3 From b713adadf8be2d75dd6cfb626aec143d7461b100 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:24 +0100 Subject: docs: kabi: helpers: add documentation for each "enum" value Ensure that kABI module documentation will describe each debug bit. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <3b118b157e52d757bf82fd74f03b0f4bd9e8b8f1.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/abi/helpers.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py index b8c8dfb1272e..2a378d780d3c 100644 --- a/tools/lib/python/abi/helpers.py +++ b/tools/lib/python/abi/helpers.py @@ -13,28 +13,28 @@ ABI_DIR = "Documentation/ABI/" class AbiDebug: """Debug levels""" - WHAT_PARSING = 1 - WHAT_OPEN = 2 - DUMP_ABI_STRUCTS = 4 - UNDEFINED = 8 - REGEX = 16 - SUBGROUP_MAP = 32 - SUBGROUP_DICT = 64 - SUBGROUP_SIZE = 128 - GRAPH = 256 - + WHAT_PARSING = 1 #: Enable debug parsing logic. + WHAT_OPEN = 2 #: Enable debug messages on file open. + DUMP_ABI_STRUCTS = 4 #: Enable debug for ABI parse data. + UNDEFINED = 8 #: Enable extra undefined symbol data. + REGEX = 16 #: Enable debug for what to regex conversion. + SUBGROUP_MAP = 32 #: Enable debug for symbol regex subgroups + SUBGROUP_DICT = 64 #: Enable debug for sysfs graph tree variable. + SUBGROUP_SIZE = 128 #: Enable debug of search groups. + GRAPH = 256 #: Display ref tree graph for undefined symbols. +#: Helper messages for each debug variable DEBUG_HELP = """ -1 - enable debug parsing logic -2 - enable debug messages on file open -4 - enable debug for ABI parse data -8 - enable extra debug information to identify troubles - with ABI symbols found at the local machine that - weren't found on ABI documentation (used only for - undefined subcommand) -16 - enable debug for what to regex conversion -32 - enable debug for symbol regex subgroups -64 - enable debug for sysfs graph tree variable +1 - enable debug parsing logic +2 - enable debug messages on file open +4 - enable debug for ABI parse data +8 - enable extra debug information to identify troubles + with ABI symbols found at the local machine that + weren't found on ABI documentation (used only for + undefined subcommand) +16 - enable debug for what to regex conversion +32 - enable debug for symbol regex subgroups +64 - enable debug for sysfs graph tree variable 128 - enable debug of search groups 256 - enable displaying refrence tree graphs for undefined symbols. """ -- cgit v1.2.3 From 8b85f614f3b68a8a58762c8f8defbcebf6f0282a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:26 +0100 Subject: docs: jobserver: do some documentation improvements Make Sphinx handle better jobserver class documentation Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <18a9c1406bdead680e3ee5768c97ae8b2138e8ea.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/jobserver.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index 616411087725..8da1973e5c87 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -11,20 +11,23 @@ Interacts with the POSIX jobserver during the Kernel build time. A "normal" jobserver task, like the one initiated by a make subrocess would do: - open read/write file descriptors to communicate with the job server; - - ask for one slot by calling: + - ask for one slot by calling:: + claim = os.read(reader, 1) - - when the job finshes, call: + + - when the job finshes, call:: + os.write(writer, b"+") # os.write(writer, claim) Here, the goal is different: This script aims to get the remaining number of slots available, using all of them to run a command which handle tasks in parallel. To to that, it has a loop that ends only after there are no slots left. It then increments the number by one, in order to allow a -call equivalent to make -j$((claim+1)), e.g. having a parent make creating +call equivalent to ``make -j$((claim+1))``, e.g. having a parent make creating $claim child to do the actual work. The end goal here is to keep the total number of build tasks under the -limit established by the initial make -j$n_proc call. +limit established by the initial ``make -j$n_proc`` call. See: https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver @@ -40,13 +43,14 @@ class JobserverExec: Claim all slots from make using POSIX Jobserver. The main methods here are: + - open(): reserves all slots; - close(): method returns all used slots back to make; - - run(): executes a command setting PARALLELISM= + - run(): executes a command setting PARALLELISM=. """ def __init__(self): - """Initialize internal vars""" + """Initialize internal vars.""" self.claim = 0 self.jobs = b"" self.reader = None @@ -54,7 +58,7 @@ class JobserverExec: self.is_open = False def open(self): - """Reserve all available slots to be claimed later on""" + """Reserve all available slots to be claimed later on.""" if self.is_open: return @@ -118,7 +122,7 @@ class JobserverExec: self.is_open = True def close(self): - """Return all reserved slots to Jobserver""" + """Return all reserved slots to Jobserver.""" if not self.is_open: return -- cgit v1.2.3 From ef6aa110d8888a14dfb2e843794097263c45a06b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:28 +0100 Subject: docs: parse_features: make documentation more consistent Do some changes to: - add missing documentation strings to vars; - add a missing docstring; - ensure that phases will end with a period. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <3722f10361638561a5ced18cf4f409930c88270b.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/feat/parse_features.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/python/feat/parse_features.py b/tools/lib/python/feat/parse_features.py index b88c04d3e2fe..41a51d9d6f62 100755 --- a/tools/lib/python/feat/parse_features.py +++ b/tools/lib/python/feat/parse_features.py @@ -21,14 +21,25 @@ class ParseFeature: from it. """ + #: feature header string. h_name = "Feature" + + #: Kernel config header string. h_kconfig = "Kconfig" + + #: description header string. h_description = "Description" + + #: subsystem header string. h_subsys = "Subsystem" + + #: status header string. h_status = "Status" + + #: architecture header string. h_arch = "Architecture" - # Sort order for status. Others will be mapped at the end. + #: Sort order for status. Others will be mapped at the end. status_map = { "ok": 0, "TODO": 1, @@ -40,7 +51,7 @@ class ParseFeature: def __init__(self, prefix, debug=0, enable_fname=False): """ - Sets internal variables + Sets internal variables. """ self.prefix = prefix @@ -63,11 +74,13 @@ class ParseFeature: self.msg = "" def emit(self, msg="", end="\n"): + """Helper function to append a new message for feature output.""" + self.msg += msg + end def parse_error(self, fname, ln, msg, data=None): """ - Displays an error message, printing file name and line + Displays an error message, printing file name and line. """ if ln: @@ -82,7 +95,7 @@ class ParseFeature: print("", file=sys.stderr) def parse_feat_file(self, fname): - """Parses a single arch-support.txt feature file""" + """Parses a single arch-support.txt feature file.""" if os.path.isdir(fname): return @@ -204,7 +217,7 @@ class ParseFeature: self.max_size_arch_with_header = self.max_size_arch + len(self.h_arch) def parse(self): - """Parses all arch-support.txt feature files inside self.prefix""" + """Parses all arch-support.txt feature files inside self.prefix.""" path = os.path.expanduser(self.prefix) @@ -281,7 +294,7 @@ class ParseFeature: def output_feature(self, feat): """ - Output a feature on all architectures + Output a feature on all architectures. """ title = f"Feature {feat}" @@ -331,7 +344,7 @@ class ParseFeature: def matrix_lines(self, desc_size, max_size_status, header): """ - Helper function to split element tables at the output matrix + Helper function to split element tables at the output matrix. """ if header: -- cgit v1.2.3 From 40146bf7555e9c3480b1225dfe7a5306e1b58b13 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 21 Jan 2026 17:11:36 +0100 Subject: selftests: net: tests for add double tunneling GRO/GSO Create a simple, netns-based topology with double, nested UDP tunnels and perform TSO transfers on top. Explicitly enable GSO and/or GRO and check the skb layout consistency with different configuration allowing (or not) GSO frames to be delivered on the other end. The trickest part is account in a robust way the aggregated/unaggregated packets with double encapsulation: use a classic bpf filter for it. Signed-off-by: Paolo Abeni Reviewed-by: Petr Machata Link: https://patch.msgid.link/61f2c98ba0f73057c2d6f6cb62eb807abd90bf6b.1769011015.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + tools/testing/selftests/net/double_udp_encap.sh | 393 ++++++++++++++++++++++++ 3 files changed, 395 insertions(+) create mode 100755 tools/testing/selftests/net/double_udp_encap.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ce9699092f50..33f56fcbde09 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -22,6 +22,7 @@ TEST_PROGS := \ cmsg_so_mark.sh \ cmsg_so_priority.sh \ cmsg_time.sh \ + double_udp_encap.sh \ drop_monitor_tests.sh \ fcnal-ipv4.sh \ fcnal-ipv6.sh \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b84362b9b508..cd49b7dfe216 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_XTABLES_LEGACY=y +CONFIG_NETFILTER_XT_MATCH_BPF=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_NETFILTER_XT_NAT=m diff --git a/tools/testing/selftests/net/double_udp_encap.sh b/tools/testing/selftests/net/double_udp_encap.sh new file mode 100755 index 000000000000..9aaf97cdf141 --- /dev/null +++ b/tools/testing/selftests/net/double_udp_encap.sh @@ -0,0 +1,393 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +# shellcheck disable=SC2155 # prefer RO variable over return value from cmd +readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py" + +readonly SRC=1 +readonly DST=2 + +readonly NET_V4=192.168.1. +readonly NET_V6=2001:db8:: +readonly OL1_NET_V4=172.16.1. +readonly OL1_NET_V6=2001:db8:1:: +readonly OL2_NET_V4=172.16.2. +readonly OL2_NET_V6=2001:db8:2:: + +trap cleanup_all_ns EXIT + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_gnv_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r gnv_dev=$3 + local -r gnv_id=$4 + local opts=$5 + local gnv_json + local rem + + if is_ipv6 "$bm_rem_addr"; then + rem=remote6 + else + rem=remote + fi + + # add ynl opt separator, if needed + [ -n "$opts" ] && opts=", $opts" + + gnv_json="{ \"id\": $gnv_id, \"$rem\": \"$bm_rem_addr\"$opts }" + ip netns exec "$netns" "$CLI" --family rt-link --create --excl \ + --do newlink --json "{\"ifname\": \"$gnv_dev\", + \"linkinfo\": {\"kind\":\"geneve\", + \"data\": $gnv_json } }" > /dev/null + ip -n "$netns" link set dev "$gnv_dev" up +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r vxlan_dev=$3 + local -r vxlan_id=$4 + local -r opts_str=$5 + local oldifs + local -a opts + local opt + + # convert the arguments from yaml format + oldifs=$IFS + IFS=',' + for opt in $opts_str; do + local pattern='"port":' + + [ -n "$opt" ] || continue + + opts+=("${opt/$pattern*/dstport}" "${opt/$pattern/}") + done + IFS=$oldifs + [ ${#opts[@]} -gt 0 ] || opts+=("dstport" "4789") + + ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \ + remote "$bm_rem_addr" "${opts[@]}" + ip -n "$netns" link set dev "$vxlan_dev" up +} + +create_ns() { + local nested_opt='"port":6082' + local create_endpoint + local options="$1" + local feature + local dev + local id + local ns + + RET=0 + + # +-------------+ +-------------+ + # | NS_SRC | | NS_NST_DST | + # | | | | + # | gnv_nst1 | | gnv_nst2 | + # | + | | + | + # | | | | | | + # | + | | + | + # | gnv1 | | gnv2 | + # | + | | + | + # | | | | | | + # | + veth1 +--------+ veth2 + | + # | | | | + # +-------------+ +-------------+ + + setup_ns NS_SRC NS_DST + + # concatenate caller provided options and default one + [ -n "$2" ] && nested_opt="$nested_opt,$2" + + ip link add name "veth$SRC" netns "$NS_SRC" type veth \ + peer name "veth$DST" netns "$NS_DST" + case "$ENCAP" in + vxlan) + create_endpoint=create_vxlan_endpoint + dev=vx + ;; + geneve) + create_endpoint=create_gnv_endpoint + dev=gnv + ;; + esac + + id=1 + for ns in "${NS_LIST[@]}"; do + ip -n "$ns" link set dev "veth$id" up + + # ensure the sender can do large write just after 3whs + ip netns exec "$ns" \ + sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304" + + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + if [ $FAMILY = "4" ]; then + ip -n "$ns" addr add dev "veth$id" "$NET_V4$id/24" + $create_endpoint "$ns" "$NET_V4$((3 - id))" \ + "$dev$id" 4 "$options" + ip -n "$ns" addr add dev "$dev$id" "$OL1_NET_V4$id/24" + + # nested tunnel devices + # pmtu can't be propagated to upper layer devices; + # need manual adjust + $create_endpoint "$ns" "$OL1_NET_V4$((3 - id))" \ + "$dev"_nst"$id" 40 "$nested_opt" + ip -n "$ns" addr add dev "$dev"_nst"$id" \ + "$OL2_NET_V4$id/24" + ip -n "$ns" link set dev "$dev"_nst"$id" mtu 1392 + else + ip -n "$ns" addr add dev "veth$id" "$NET_V6$id/64" \ + nodad + $create_endpoint "$ns" "$NET_V6$((3 - id))" \ + "$dev"6"$id" 6 "$options" + ip -n "$ns" addr add dev "$dev"6"$id" \ + "$OL1_NET_V6$id/64" nodad + + $create_endpoint "$ns" "$OL1_NET_V6$((3 - id))" \ + "$dev"6_nst"$id" 60 "$nested_opt" + ip -n "$ns" addr add dev "$dev"6_nst"$id" \ + "$OL2_NET_V6$id/64" nodad + ip -n "$ns" link set dev "$dev"6_nst"$id" mtu 1352 + fi + id=$((id+1)) + done + + # enable GRO heuristic on the veth peer and ensure UDP L4 over tunnel is + # actually segmented + for feature in tso tx-udp_tnl-segmentation; do + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \ + "$feature" off 2>/dev/null + done +} + +create_ns_gso() { + local dev + + create_ns "$@" + if [ "$ENCAP" = "geneve" ]; then + dev=gnv + else + dev=vx + fi + [ "$FAMILY" = "6" ] && dev="$dev"6 + ip netns exec "$NS_SRC" ethtool -K "$dev$SRC" \ + tx-gso-partial on \ + tx-udp_tnl-segmentation on \ + tx-udp_tnl-csum-segmentation on +} + +create_ns_gso_gro() { + create_ns_gso "$@" + ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off >/dev/null 2>&1 +} + +run_test() { + local -r dst=$NET$DST + local -r msg=$1 + local -r total_size=$2 + local -r encappkts=$3 + local inner_proto_offset=0 + local inner_maclen=14 + local rx_family="-4" + local ipt=iptables + local bpf_filter + local -a rx_args + local wire_pkts + local rcvpkts + local encl=8 + local dport + local pkts + local snd + + if [ $FAMILY = "6" ]; then + ipt=ip6tables + else + # rx program does not support '-6' and implies ipv6 usage by + # default + rx_args=("$rx_family") + fi + + # The received can only check fixed size packet + pkts=$((total_size / GSO_SIZE)) + if [ -n "$4" ]; then + wire_pkts=$4 + elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then + wire_pkts=1 + rx_args+=("-l" "$GSO_SIZE") + else + wire_pkts=2 + pkts=$((pkts + 1)) + fi + + if [ "$ENCAP" = "geneve" ]; then + dport=6081 + else + dport=4789 + fi + + # Either: + # - IPv4, nested tunnel carries UDP over IPv4, with dport 6082, + # innermost is TCP over IPv4 on port 8000 + # - IPv6, nested tunnel carries UDP over IPv6, with dport 6082, + # innermost is TCP over IPv6 on port 8000 + # The nested tunnel port is 6082 and the nested encap len is 8 + # regardless of the encap type (no geneve opts). + # In inherit protocol mode there is no nested mac hdr and the nested + # l3 protocol type field belongs to the geneve hdr. + [ "$USE_HINT" = true ] && encl=16 + [ "$INHERIT" = true ] && inner_maclen=0 + [ "$INHERIT" = true ] && inner_proto_offset=-4 + local inner=$((inner_maclen+encl)) + local proto=$((inner_maclen+encl+inner_proto_offset)) + bpf_filter=$(nfbpf_compile "(ip && + ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 && + ip[$((51+encl))] == 0x11 && + ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 && + ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 && + ip[$((87+inner))] == 0x6 && + ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) || + (ip6 && + ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd && + ip6[$((68+encl))] == 0x11 && + ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 && + ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd && + ip6[$((124+inner))] == 0x6 && + ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)") + + # ignore shorts packet, to avoid arp/mld induced noise + ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \ + -n "$pkts" "${rx_args[@]}" & + local pid=$! + wait_local_port_listen "$NS_DST" 8000 tcp + ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \ + -s "$total_size" -D "$dst" + local ret=$? + check_err "$ret" "client failure exit code $ret" + wait "$pid" + ret=$? + check_err "$ret" "sever failure exit code $ret" + + snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c | + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$snd" = "$wire_pkts" ] + # shellcheck disable=SC2319 # known false positive + check_err $? "send $snd packets on the lowest link, expected $wire_pkts" + + rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \ + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$rcvpkts" = "$encappkts" ] + check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts" + log_test "$msg" +} + +run_tests() { + for FAMILY in 4 6; do + NET=$OL2_NET_V4 + GSO_SIZE=1340 # 1392 - 20 - 32 + + if [ $FAMILY = 6 ]; then + NET=$OL2_NET_V6 + GSO_SIZE=1280 # 1352 - 40 - 32 + fi + + echo "IPv$FAMILY" + + unset USE_HINT + unset INHERIT + + # "geneve" must be last encap in list, so that later + # test cases will run on it + for ENCAP in "vxlan" "geneve"; do + create_ns + run_test "No GSO - $ENCAP" $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + + create_ns_gso + run_test "GSO without GRO - $ENCAP" $((GSO_SIZE * 4)) \ + 4 1 + cleanup_all_ns + + # IPv4 only test + [ $FAMILY = "4" ] || continue + create_ns_gso + ip netns exec "$NS_SRC" \ + sysctl -qw net.ipv4.ip_no_pmtu_disc=1 + run_test "GSO disable due to no fixedid - $ENCAP" \ + $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + done + + # GRO tests imply/require geneve encap, the only one providing + # GRO hints + create_ns_gso_gro + run_test "double tunnel GRO, no hints" $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + # hint option is expected for all the following tests in the RX + # path + USE_HINT=true + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO" $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' '"udp-csum":1' + run_test "double tunnel GRO - csum complete" $((GSO_SIZE * 4))\ + 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1' \ + '"udp-csum":0,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO - no nested csum" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-csum":1' + run_test "double tunnel GRO - nested csum, outer 0-csum, skip"\ + $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + INHERIT=true + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' \ + '"udp-csum":1,"inner-proto-inherit":1' + run_test "double tunnel GRO - nested inherit proto" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + unset INHERIT + + create_ns_gso_gro '"gro-hint":1' + run_test "double tunnel GRO - short last pkt" \ + $((GSO_SIZE * 4 + GSO_SIZE / 2)) 2 + cleanup_all_ns + done +} + +require_command nfbpf_compile +require_command jq + +# tcp retransmisions will break the accounting +xfail_on_slow run_tests +exit "$EXIT_STATUS" -- cgit v1.2.3 From e073c118db0217e1dab14eb0088e7c6a8bf9ef63 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:55 +0800 Subject: selftest: tun: Format tun.c existing code In preparation for adding new tests for GSO over UDP tunnels, apply consistently the kernel style to the existing code. Signed-off-by: Xu Du Link: https://patch.msgid.link/d797de1e5a3d215dd78cb46775772ef682bab60e.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 0efc67b0357a..128b0a5327d4 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -25,7 +25,7 @@ static int tun_attach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_ATTACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_detach(int fd, char *dev) @@ -36,7 +36,7 @@ static int tun_detach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_DETACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_alloc(char *dev) @@ -54,7 +54,7 @@ static int tun_alloc(char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE; - err = ioctl(fd, TUNSETIFF, (void *) &ifr); + err = ioctl(fd, TUNSETIFF, (void *)&ifr); if (err < 0) { fprintf(stderr, "can't TUNSETIFF: %s\n", strerror(errno)); close(fd); @@ -67,9 +67,9 @@ static int tun_alloc(char *dev) static int tun_delete(char *dev) { struct { - struct nlmsghdr nh; + struct nlmsghdr nh; struct ifinfomsg ifm; - unsigned char data[64]; + unsigned char data[64]; } req; struct rtattr *rta; int ret, rtnl; @@ -127,31 +127,36 @@ FIXTURE_TEARDOWN(tun) close(self->fd2); } -TEST_F(tun, delete_detach_close) { +TEST_F(tun, delete_detach_close) +{ EXPECT_EQ(tun_delete(self->ifname), 0); EXPECT_EQ(tun_detach(self->fd, self->ifname), -1); EXPECT_EQ(errno, 22); } -TEST_F(tun, detach_delete_close) { +TEST_F(tun, detach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, detach_close_delete) { +TEST_F(tun, detach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); close(self->fd); self->fd = -1; EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_delete_close) { +TEST_F(tun, reattach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_close_delete) { +TEST_F(tun, reattach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); close(self->fd); -- cgit v1.2.3 From a942fcd72e9736a0c49b60160f29dc5bb01d6f89 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:56 +0800 Subject: selftest: tun: Introduce tuntap_helpers.h header for TUN/TAP testing Introduce rtnetlink manipulation and packet construction helpers that will simplify the later creation of more related test cases. This avoids duplicating logic across different test cases. This new header will contain: - YNL-based netlink management utilities. - Helpers for ip link, ip address, ip neighbor and ip route operations. - Packet construction and manipulation helpers. Signed-off-by: Xu Du Link: https://patch.msgid.link/91f905715c69c75f7bf72d43388921fde6c34989.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tuntap_helpers.h | 390 +++++++++++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 tools/testing/selftests/net/tuntap_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/net/tuntap_helpers.h b/tools/testing/selftests/net/tuntap_helpers.h new file mode 100644 index 000000000000..d6c0437136ec --- /dev/null +++ b/tools/testing/selftests/net/tuntap_helpers.h @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TUNTAP_HELPERS_H +#define _TUNTAP_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rt-route-user.h" +#include "rt-addr-user.h" +#include "rt-neigh-user.h" +#include "rt-link-user.h" + +#define GENEVE_HLEN 8 +#define PKT_DATA 0xCB +#define TUNTAP_DEFAULT_TTL 8 +#define TUNTAP_DEFAULT_IPID 1337 + +unsigned int if_nametoindex(const char *ifname); + +static inline int ip_addr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct in_addr) : + sizeof(struct in6_addr); +} + +static inline void fill_ifaddr_msg(struct ifaddrmsg *ifam, int family, + int prefix, int flags, const char *dev) +{ + ifam->ifa_family = family; + ifam->ifa_prefixlen = prefix; + ifam->ifa_index = if_nametoindex(dev); + ifam->ifa_flags = flags; + ifam->ifa_scope = RT_SCOPE_UNIVERSE; +} + +static inline int ip_addr_add(const char *dev, int family, void *addr, + uint8_t prefix) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ifa_flags = IFA_F_PERMANENT | IFA_F_NODAD; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_addr_newaddr_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_addr_family, NULL); + if (!ys) + return -1; + + req = rt_addr_newaddr_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_ifaddr_msg(&req->_hdr, family, prefix, ifa_flags, dev); + rt_addr_newaddr_req_set_nlflags(req, nl_flags); + rt_addr_newaddr_req_set_local(req, addr, ipalen); + + ret = rt_addr_newaddr(ys, req); + rt_addr_newaddr_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_neigh_req_header(struct ndmsg *ndm, int family, + int state, const char *dev) +{ + ndm->ndm_family = family; + ndm->ndm_ifindex = if_nametoindex(dev); + ndm->ndm_state = state; + ndm->ndm_flags = 0; + ndm->ndm_type = RTN_UNICAST; +} + +static inline int ip_neigh_add(const char *dev, int family, void *addr, + unsigned char *lladdr) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_neigh_newneigh_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_neigh_family, NULL); + if (!ys) + return -1; + + req = rt_neigh_newneigh_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_neigh_req_header(&req->_hdr, family, NUD_PERMANENT, dev); + rt_neigh_newneigh_req_set_nlflags(req, nl_flags); + rt_neigh_newneigh_req_set_dst(req, addr, ipalen); + rt_neigh_newneigh_req_set_lladdr(req, lladdr, ETH_ALEN); + rt_neigh_newneigh_req_set_ifindex(req, if_nametoindex(dev)); + + ret = rt_neigh_newneigh(ys, req); + rt_neigh_newneigh_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_route_req_header(struct rtmsg *rtm, int family, + int table) +{ + rtm->rtm_family = family; + rtm->rtm_table = table; +} + +static inline int +ip_route_get(const char *dev, int family, int table, void *dst, + void (*parse_rsp)(struct rt_route_getroute_rsp *rsp, void *out), + void *out) +{ + int ret = -1, ipalen = ip_addr_len(family); + struct rt_route_getroute_req *req; + struct rt_route_getroute_rsp *rsp; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_route_family, NULL); + if (!ys) + return -1; + + req = rt_route_getroute_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_route_req_header(&req->_hdr, family, table); + rt_route_getroute_req_set_nlflags(req, NLM_F_REQUEST); + rt_route_getroute_req_set_dst(req, dst, ipalen); + rt_route_getroute_req_set_oif(req, if_nametoindex(dev)); + + rsp = rt_route_getroute(ys, req); + if (!rsp) + goto err_rsp_get; + + ret = 0; + if (parse_rsp) + parse_rsp(rsp, out); + + rt_route_getroute_rsp_free(rsp); +err_rsp_get: + rt_route_getroute_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int +ip_link_add(const char *dev, char *link_type, + int (*fill_link_attr)(struct rt_link_newlink_req *req, void *data), + void *data) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + struct rt_link_newlink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_newlink_req_alloc(); + if (!req) + goto err_req_alloc; + + req->_hdr.ifi_flags = IFF_UP; + rt_link_newlink_req_set_nlflags(req, nl_flags); + rt_link_newlink_req_set_ifname(req, dev); + rt_link_newlink_req_set_linkinfo_kind(req, link_type); + + if (fill_link_attr && fill_link_attr(req, data) < 0) + goto err_attr_fill; + + ret = rt_link_newlink(ys, req); +err_attr_fill: + rt_link_newlink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int ip_link_del(const char *dev) +{ + struct rt_link_dellink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_dellink_req_alloc(); + if (!req) + goto err_req_alloc; + + rt_link_dellink_req_set_nlflags(req, NLM_F_REQUEST); + rt_link_dellink_req_set_ifname(req, dev); + + ret = rt_link_dellink(ys, req); + rt_link_dellink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline size_t build_eth(uint8_t *buf, uint16_t proto, unsigned char *src, + unsigned char *dest) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + + eth->h_proto = htons(proto); + memcpy(eth->h_source, src, ETH_ALEN); + memcpy(eth->h_dest, dest, ETH_ALEN); + + return ETH_HLEN; +} + +static inline uint32_t add_csum(const uint8_t *buf, int len) +{ + uint16_t *sbuf = (uint16_t *)buf; + uint32_t sum = 0; + + while (len > 1) { + sum += *sbuf++; + len -= 2; + } + + if (len) + sum += *(uint8_t *)sbuf; + + return sum; +} + +static inline uint16_t finish_ip_csum(uint32_t sum) +{ + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + return ~((uint16_t)sum); +} + +static inline uint16_t build_ip_csum(const uint8_t *buf, int len, uint32_t sum) +{ + sum += add_csum(buf, len); + return finish_ip_csum(sum); +} + +static inline int build_ipv4_header(uint8_t *buf, uint8_t proto, + int payload_len, struct in_addr *src, + struct in_addr *dst) +{ + struct iphdr *iph = (struct iphdr *)buf; + + iph->ihl = 5; + iph->version = 4; + iph->ttl = TUNTAP_DEFAULT_TTL; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->id = htons(TUNTAP_DEFAULT_IPID); + iph->protocol = proto; + iph->saddr = src->s_addr; + iph->daddr = dst->s_addr; + iph->check = build_ip_csum(buf, iph->ihl << 2, 0); + + return iph->ihl << 2; +} + +static inline void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) +{ + uint16_t val, *ptr = (uint16_t *)ip6h; + + val = ntohs(*ptr); + val &= 0xF00F; + val |= ((uint16_t)dsfield) << 4; + *ptr = htons(val); +} + +static inline int build_ipv6_header(uint8_t *buf, uint8_t proto, + uint8_t dsfield, int payload_len, + struct in6_addr *src, struct in6_addr *dst) +{ + struct ipv6hdr *ip6h = (struct ipv6hdr *)buf; + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = proto; + ip6h->hop_limit = TUNTAP_DEFAULT_TTL; + ipv6_set_dsfield(ip6h, dsfield); + memcpy(&ip6h->saddr, src, sizeof(ip6h->saddr)); + memcpy(&ip6h->daddr, dst, sizeof(ip6h->daddr)); + + return sizeof(struct ipv6hdr); +} + +static inline int build_geneve_header(uint8_t *buf, uint32_t vni) +{ + uint16_t protocol = htons(ETH_P_TEB); + uint32_t geneve_vni = htonl((vni << 8) & 0xffffff00); + + memcpy(buf + 2, &protocol, 2); + memcpy(buf + 4, &geneve_vni, 4); + return GENEVE_HLEN; +} + +static inline int build_udp_header(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len) +{ + struct udphdr *udph = (struct udphdr *)buf; + + udph->source = htons(sport); + udph->dest = htons(dport); + udph->len = htons(sizeof(*udph) + payload_len); + return sizeof(*udph); +} + +static inline void build_udp_packet_csum(uint8_t *buf, int family, + bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + size_t ipalen = ip_addr_len(family); + uint32_t sum; + + /* No extension IPv4 and IPv6 headers addresses are the last fields */ + sum = add_csum(buf - 2 * ipalen, 2 * ipalen); + sum += htons(IPPROTO_UDP) + udph->len; + + if (!csum_off) + sum += add_csum(buf, udph->len); + + udph->check = finish_ip_csum(sum); +} + +static inline int build_udp_packet(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len, int family, bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + + build_udp_header(buf, sport, dport, payload_len); + memset(buf + sizeof(*udph), PKT_DATA, payload_len); + build_udp_packet_csum(buf, family, csum_off); + + return sizeof(*udph) + payload_len; +} + +static inline int build_virtio_net_hdr_v1_hash_tunnel(uint8_t *buf, bool is_tap, + int hdr_len, int gso_size, + int outer_family, + int inner_family) +{ + struct virtio_net_hdr_v1_hash_tunnel *vh_tunnel = (void *)buf; + struct virtio_net_hdr_v1 *vh = &vh_tunnel->hash_hdr.hdr; + int outer_iphlen, inner_iphlen, eth_hlen, gso_type; + + eth_hlen = is_tap ? ETH_HLEN : 0; + outer_iphlen = (outer_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + inner_iphlen = (inner_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + + vh_tunnel->outer_th_offset = eth_hlen + outer_iphlen; + vh_tunnel->inner_nh_offset = vh_tunnel->outer_th_offset + ETH_HLEN + + GENEVE_HLEN + sizeof(struct udphdr); + + vh->csum_start = vh_tunnel->inner_nh_offset + inner_iphlen; + vh->csum_offset = __builtin_offsetof(struct udphdr, check); + vh->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vh->hdr_len = hdr_len; + vh->gso_size = gso_size; + + if (gso_size) { + gso_type = outer_family == AF_INET ? + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 : + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; + vh->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4 | gso_type; + } + + return sizeof(struct virtio_net_hdr_v1_hash_tunnel); +} + +#endif /* _TUNTAP_HELPERS_H */ -- cgit v1.2.3 From 82cfdcfa201057861ec8a60ca9354d2c4c67bd68 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:57 +0800 Subject: selftest: tun: Refactor tun_delete to use tuntap_helpers The previous patch introduced common tuntap helpers to simplify tun test code. This patch refactors the tun_delete function to use these new helpers. Signed-off-by: Xu Du Link: https://patch.msgid.link/ecc7c0c2d75d87cb814e97579e731650339703ab.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 16 ++++++++++++--- tools/testing/selftests/net/tun.c | 39 ++---------------------------------- 2 files changed, 15 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 33f56fcbde09..afdea6d95bde 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -183,7 +183,6 @@ TEST_GEN_PROGS := \ tap \ tcp_port_share \ tls \ - tun \ # end of TEST_GEN_PROGS TEST_FILES := \ @@ -195,7 +194,11 @@ TEST_FILES := \ # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller -YNL_GEN_PROGS := netlink-dumps +YNL_GEN_PROGS := \ + netlink-dumps \ + tun \ +# end of YNL_GEN_PROGS + TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_PROGS += $(YNL_GEN_PROGS) @@ -206,7 +209,14 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk # YNL build -YNL_GENS := netdev +YNL_GENS := \ + netdev \ + rt-addr \ + rt-link \ + rt-neigh \ + rt-route \ +# end of YNL_GENS + include ynl.mk $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 128b0a5327d4..d9030bdd2e06 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -8,14 +8,12 @@ #include #include #include -#include #include -#include -#include #include #include #include "kselftest_harness.h" +#include "tuntap_helpers.h" static int tun_attach(int fd, char *dev) { @@ -66,40 +64,7 @@ static int tun_alloc(char *dev) static int tun_delete(char *dev) { - struct { - struct nlmsghdr nh; - struct ifinfomsg ifm; - unsigned char data[64]; - } req; - struct rtattr *rta; - int ret, rtnl; - - rtnl = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); - if (rtnl < 0) { - fprintf(stderr, "can't open rtnl: %s\n", strerror(errno)); - return 1; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(req.ifm))); - req.nh.nlmsg_flags = NLM_F_REQUEST; - req.nh.nlmsg_type = RTM_DELLINK; - - req.ifm.ifi_family = AF_UNSPEC; - - rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); - rta->rta_type = IFLA_IFNAME; - rta->rta_len = RTA_LENGTH(IFNAMSIZ); - req.nh.nlmsg_len += rta->rta_len; - memcpy(RTA_DATA(rta), dev, IFNAMSIZ); - - ret = send(rtnl, &req, req.nh.nlmsg_len, 0); - if (ret < 0) - fprintf(stderr, "can't send: %s\n", strerror(errno)); - ret = (unsigned int)ret != req.nh.nlmsg_len; - - close(rtnl); - return ret; + return ip_link_del(dev); } FIXTURE(tun) -- cgit v1.2.3 From 24e59f26eef2c806a8dbb94859aaf7af28197148 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:58 +0800 Subject: selftest: tun: Add helpers for GSO over UDP tunnel In preparation for testing GSO over UDP tunnels, enhance the test infrastructure to support a more complex data path involving a TUN device and a GENEVE udp tunnel. This patch introduces a dedicated setup/teardown topology that creates both a GENEVE tunnel interface and a TUN interface. The TUN device acts as the VTEP (Virtual Tunnel Endpoint), allowing it to send and receive virtio-net packets. This setup effectively tests the kernel's data path for encapsulated traffic. Note that after adding a new address to the UDP tunnel, we need to wait a bit until the associated route is available. Additionally, a new data structure is defined to manage test parameters. This structure is designed to be extensible, allowing different test data and configurations to be easily added in subsequent patches. Signed-off-by: Xu Du Link: https://patch.msgid.link/b5787b8c269f43ce11e1756f1691cc7fd9a1e901.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 425 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index d9030bdd2e06..ec089355312b 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -15,6 +15,80 @@ #include "kselftest_harness.h" #include "tuntap_helpers.h" +static const char param_dev_geneve_name[] = "geneve1"; +static unsigned char param_hwaddr_outer_dst[] = { 0x00, 0xfe, 0x98, + 0x14, 0x22, 0x42 }; +static unsigned char param_hwaddr_outer_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0x43 }; +static unsigned char param_hwaddr_inner_dst[] = { 0x00, 0xfe, 0x98, + 0x94, 0x22, 0xcc }; +static unsigned char param_hwaddr_inner_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0xdd }; + +static struct in_addr param_ipaddr4_outer_dst = { + __constant_htonl(0xac100001), +}; + +static struct in_addr param_ipaddr4_outer_src = { + __constant_htonl(0xac100002), +}; + +static struct in_addr param_ipaddr4_inner_dst = { + __constant_htonl(0xac100101), +}; + +static struct in_addr param_ipaddr4_inner_src = { + __constant_htonl(0xac100102), +}; + +static struct in6_addr param_ipaddr6_outer_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_outer_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +static struct in6_addr param_ipaddr6_inner_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_inner_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +#define VN_ID 1 +#define VN_PORT 4789 +#define UDP_SRC_PORT 22 +#define UDP_DST_PORT 48878 +#define IPPREFIX_LEN 24 +#define IP6PREFIX_LEN 64 +#define TIMEOUT_SEC 10 +#define TIMEOUT_USEC 100000 +#define MAX_RETRIES 20 + +#define UDP_TUNNEL_GENEVE_4IN4 0x01 +#define UDP_TUNNEL_GENEVE_6IN4 0x02 +#define UDP_TUNNEL_GENEVE_4IN6 0x04 +#define UDP_TUNNEL_GENEVE_6IN6 0x08 + +#define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) +#define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) + +#define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) + +struct geneve_setup_config { + int family; + union { + struct in_addr r4; + struct in6_addr r6; + } remote; + __be32 vnid; + __be16 vnport; + unsigned char hwaddr[6]; + uint8_t csum; +}; + static int tun_attach(int fd, char *dev) { struct ifreq ifr; @@ -67,6 +141,202 @@ static int tun_delete(char *dev) return ip_link_del(dev); } +static int tun_open(char *dev, const int flags, const int hdrlen, + const int features, const unsigned char *mac_addr) +{ + struct ifreq ifr = { 0 }; + int fd, sk = -1; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("open"); + return -1; + } + + ifr.ifr_flags = flags; + if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) { + perror("ioctl(TUNSETIFF)"); + goto err; + } + strcpy(dev, ifr.ifr_name); + + if (hdrlen > 0) { + if (ioctl(fd, TUNSETVNETHDRSZ, &hdrlen) < 0) { + perror("ioctl(TUNSETVNETHDRSZ)"); + goto err; + } + } + + if (features) { + if (ioctl(fd, TUNSETOFFLOAD, features) < 0) { + perror("ioctl(TUNSETOFFLOAD)"); + goto err; + } + } + + sk = socket(PF_INET, SOCK_DGRAM, 0); + if (sk < 0) { + perror("socket"); + goto err; + } + + if (ioctl(sk, SIOCGIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCGIFFLAGS)"); + goto err; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + if (ioctl(sk, SIOCSIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCSIFFLAGS)"); + goto err; + } + + if (mac_addr && flags & IFF_TAP) { + ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; + memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETH_ALEN); + + if (ioctl(sk, SIOCSIFHWADDR, &ifr) < 0) { + perror("ioctl(SIOCSIFHWADDR)"); + goto err; + } + } + +out: + if (sk >= 0) + close(sk); + return fd; + +err: + close(fd); + fd = -1; + goto out; +} + +static size_t sockaddr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); +} + +static int geneve_fill_newlink(struct rt_link_newlink_req *req, void *data) +{ + struct geneve_setup_config *cfg = data; + +#define SET_GENEVE_REMOTE rt_link_newlink_req_set_linkinfo_data_geneve_remote +#define SET_GENEVE_REMOTE6 rt_link_newlink_req_set_linkinfo_data_geneve_remote6 + + rt_link_newlink_req_set_address(req, cfg->hwaddr, ETH_ALEN); + rt_link_newlink_req_set_linkinfo_data_geneve_id(req, cfg->vnid); + rt_link_newlink_req_set_linkinfo_data_geneve_port(req, cfg->vnport); + rt_link_newlink_req_set_linkinfo_data_geneve_udp_csum(req, cfg->csum); + + if (cfg->family == AF_INET) + SET_GENEVE_REMOTE(req, cfg->remote.r4.s_addr); + else + SET_GENEVE_REMOTE6(req, &cfg->remote.r6, + sizeof(cfg->remote.r6)); + + return 0; +} + +static int geneve_create(const char *dev, int family, void *remote, + void *hwaddr) +{ + struct geneve_setup_config geneve; + + memset(&geneve, 0, sizeof(geneve)); + geneve.vnid = VN_ID; + geneve.vnport = htons(VN_PORT); + geneve.csum = 1; + geneve.family = family; + if (family == AF_INET) + memcpy(&geneve.remote.r4, remote, sizeof(struct in_addr)); + else + memcpy(&geneve.remote.r6, remote, sizeof(struct in6_addr)); + memcpy(geneve.hwaddr, hwaddr, ETH_ALEN); + + return ip_link_add(dev, "geneve", geneve_fill_newlink, (void *)&geneve); +} + +static int set_pmtu_discover(int fd, bool is_ipv4) +{ + int level, name, val; + + if (is_ipv4) { + level = SOL_IP; + name = IP_MTU_DISCOVER; + val = IP_PMTUDISC_DO; + } else { + level = SOL_IPV6; + name = IPV6_MTU_DISCOVER; + val = IPV6_PMTUDISC_DO; + } + + return setsockopt(fd, level, name, &val, sizeof(val)); +} + +static int udp_socket_open(struct sockaddr_storage *ssa, bool do_frag, + bool do_connect, struct sockaddr_storage *dsa) +{ + struct timeval to = { .tv_sec = TIMEOUT_SEC }; + int fd, family = ssa->ss_family; + int salen = sockaddr_len(family); + + fd = socket(family, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (bind(fd, (struct sockaddr *)ssa, salen) < 0) { + perror("bind"); + goto err; + } + + if (do_connect && connect(fd, (struct sockaddr *)dsa, salen) < 0) { + perror("connect"); + goto err; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)) < 0) { + perror("setsockopt(SO_RCVTIMEO)"); + goto err; + } + + if (!do_frag && set_pmtu_discover(fd, family == AF_INET) < 0) { + perror("set_pmtu_discover"); + goto err; + } + return fd; + +err: + close(fd); + return -1; +} + +static void parse_route_rsp(struct rt_route_getroute_rsp *rsp, void *rtm_type) +{ + *(uint8_t *)rtm_type = rsp->_hdr.rtm_type; +} + +static int ip_route_check(const char *intf, int family, void *addr) +{ + uint8_t rtm_type, table = RT_TABLE_LOCAL; + int retries = MAX_RETRIES; + + while (retries-- > 0) { + if (ip_route_get(intf, family, table, addr, parse_route_rsp, + &rtm_type) == 0 && + rtm_type == RTN_LOCAL) + break; + + usleep(TIMEOUT_USEC); + } + + if (retries < 0) + return -1; + + return 0; +} + FIXTURE(tun) { char ifname[IFNAMSIZ]; @@ -129,4 +399,159 @@ TEST_F(tun, reattach_close_delete) EXPECT_EQ(tun_delete(self->ifname), 0); } +FIXTURE(tun_vnet_udptnl) +{ + char ifname[IFNAMSIZ]; + int fd, sock; +}; + +FIXTURE_VARIANT(tun_vnet_udptnl) +{ + int tunnel_type; + bool is_tap; +}; + +/* clang-format off */ +#define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) { \ + .tunnel_type = type, \ + .is_tap = true, \ + } +/* clang-format on */ + +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN6, 4in6); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN6, 6in6); + +static void assign_ifaddr_vars(int family, int is_outer, void **srcip, + void **dstip, void **srcmac, void **dstmac) +{ + if (is_outer) { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_outer_src; + *dstip = (void *)¶m_ipaddr4_outer_dst; + } else { + *srcip = (void *)¶m_ipaddr6_outer_src; + *dstip = (void *)¶m_ipaddr6_outer_dst; + } + *srcmac = param_hwaddr_outer_src; + *dstmac = param_hwaddr_outer_dst; + } else { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_inner_src; + *dstip = (void *)¶m_ipaddr4_inner_dst; + } else { + *srcip = (void *)¶m_ipaddr6_inner_src; + *dstip = (void *)¶m_ipaddr6_inner_dst; + } + *srcmac = param_hwaddr_inner_src; + *dstmac = param_hwaddr_inner_dst; + } +} + +static void assign_sockaddr_vars(int family, int is_outer, + struct sockaddr_storage *src, + struct sockaddr_storage *dst) +{ + src->ss_family = family; + dst->ss_family = family; + + if (family == AF_INET) { + struct sockaddr_in *s4 = (struct sockaddr_in *)src; + struct sockaddr_in *d4 = (struct sockaddr_in *)dst; + + s4->sin_addr = is_outer ? param_ipaddr4_outer_src : + param_ipaddr4_inner_src; + d4->sin_addr = is_outer ? param_ipaddr4_outer_dst : + param_ipaddr4_inner_dst; + if (!is_outer) { + s4->sin_port = htons(UDP_SRC_PORT); + d4->sin_port = htons(UDP_DST_PORT); + } + } else { + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *d6 = (struct sockaddr_in6 *)dst; + + s6->sin6_addr = is_outer ? param_ipaddr6_outer_src : + param_ipaddr6_inner_src; + d6->sin6_addr = is_outer ? param_ipaddr6_outer_dst : + param_ipaddr6_inner_dst; + if (!is_outer) { + s6->sin6_port = htons(UDP_SRC_PORT); + d6->sin6_port = htons(UDP_DST_PORT); + } + } +} + +FIXTURE_SETUP(tun_vnet_udptnl) +{ + int ret, family, prefix, flags, features; + int tunnel_type = variant->tunnel_type; + struct sockaddr_storage ssa, dsa; + void *sip, *dip, *smac, *dmac; + + flags = (variant->is_tap ? IFF_TAP : IFF_TUN) | IFF_VNET_HDR | + IFF_MULTI_QUEUE | IFF_NO_PI; + features = TUN_F_CSUM | TUN_F_UDP_TUNNEL_GSO | + TUN_F_UDP_TUNNEL_GSO_CSUM | TUN_F_USO4 | TUN_F_USO6; + self->fd = tun_open(self->ifname, flags, TUN_VNET_TNL_SIZE, features, + param_hwaddr_outer_src); + ASSERT_GE(self->fd, 0); + + family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 1, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(self->ifname, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(self->ifname, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(self->ifname, family, sip); + ASSERT_EQ(ret, 0); + + ret = geneve_create(param_dev_geneve_name, family, dip, + param_hwaddr_inner_src); + ASSERT_EQ(ret, 0); + + family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 0, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(param_dev_geneve_name, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(param_dev_geneve_name, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(param_dev_geneve_name, family, sip); + ASSERT_EQ(ret, 0); + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + self->sock = udp_socket_open(&ssa, false, true, &dsa); + ASSERT_GE(self->sock, 0); +} + +FIXTURE_TEARDOWN(tun_vnet_udptnl) +{ + int ret; + + if (self->sock != -1) + close(self->sock); + + ret = ip_link_del(param_dev_geneve_name); + EXPECT_EQ(ret, 0); + + ret = tun_delete(self->ifname); + EXPECT_EQ(ret, 0); +} + +TEST_F(tun_vnet_udptnl, basic) +{ + int ret; + char cmd[256] = { 0 }; + + sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name); + ret = system(cmd); + ASSERT_EQ(ret, 0); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 400e658aa096cda99b37ce806ed63cfe894c9566 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:59 +0800 Subject: selftest: tun: Add test for sending gso packet into tun The test constructs a raw packet, prepends a virtio_net_hdr, and writes the result to the TUN device. This mimics the behavior of a vm forwarding a guest's packet to the host networking stack. Signed-off-by: Xu Du Link: https://patch.msgid.link/a988dbc9ca109e4f1f0b33858c5035bce8ebede3.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 144 +++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index ec089355312b..f0ddb5d37683 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -75,7 +75,34 @@ static struct in6_addr param_ipaddr6_inner_src = { #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) +#define UDP_TUNNEL_GENEVE_4IN4_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct iphdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN6_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct ipv6hdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_4IN6_HDRLEN \ + (ETH_HLEN + sizeof(struct iphdr) + sizeof(struct ipv6hdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN4_HDRLEN \ + (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct iphdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) + +#define UDP_TUNNEL_HDRLEN(type) \ + ((type) == UDP_TUNNEL_GENEVE_4IN4 ? UDP_TUNNEL_GENEVE_4IN4_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN6 ? UDP_TUNNEL_GENEVE_6IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_4IN6 ? UDP_TUNNEL_GENEVE_4IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN4 ? UDP_TUNNEL_GENEVE_6IN4_HDRLEN : \ + 0) + +#define UDP_TUNNEL_MSS(type) (ETH_DATA_LEN - UDP_TUNNEL_HDRLEN(type)) +#define UDP_TUNNEL_MAX(type, is_tap) \ + (ETH_MAX_MTU - UDP_TUNNEL_HDRLEN(type) - ((is_tap) ? ETH_HLEN : 0)) + #define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) +#define MAX_VNET_TUNNEL_PACKET_SZ \ + (TUN_VNET_TNL_SIZE + ETH_HLEN + UDP_TUNNEL_GENEVE_6IN6_HDRLEN + \ + ETH_MAX_MTU) struct geneve_setup_config { int family; @@ -408,15 +435,23 @@ FIXTURE(tun_vnet_udptnl) FIXTURE_VARIANT(tun_vnet_udptnl) { int tunnel_type; - bool is_tap; + int gso_size; + int data_size; + int r_num_mss; + bool is_tap, no_gso; }; /* clang-format off */ #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ - FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) { \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) { \ + /* send a single MSS: fall back to no GSO */ \ .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ .is_tap = true, \ - } + .no_gso = true, \ + }; /* clang-format on */ TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); @@ -544,14 +579,105 @@ FIXTURE_TEARDOWN(tun_vnet_udptnl) EXPECT_EQ(ret, 0); } -TEST_F(tun_vnet_udptnl, basic) +static int build_gso_packet_into_tun(const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant, + uint8_t *buf) { - int ret; - char cmd[256] = { 0 }; + int pktlen, hlen, proto, inner_family, outer_family; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + uint8_t *outer_udph, *cur = buf; + void *sip, *dip, *smac, *dmac; + bool is_tap = variant->is_tap; - sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name); - ret = system(cmd); - ASSERT_EQ(ret, 0); + hlen = (is_tap ? ETH_HLEN : 0) + UDP_TUNNEL_HDRLEN(tunnel_type); + inner_family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + outer_family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : + AF_INET6; + + cur += build_virtio_net_hdr_v1_hash_tunnel(cur, is_tap, hlen, gso_size, + outer_family, inner_family); + + pktlen = hlen + payload_len; + assign_ifaddr_vars(outer_family, 1, &sip, &dip, &smac, &dmac); + + if (is_tap) { + proto = outer_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + pktlen -= ETH_HLEN; + cur += build_eth(cur, proto, dmac, smac); + } + + if (outer_family == AF_INET) { + pktlen = pktlen - sizeof(struct iphdr); + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + } else { + pktlen = pktlen - sizeof(struct ipv6hdr); + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + } + + outer_udph = cur; + assign_ifaddr_vars(inner_family, 0, &sip, &dip, &smac, &dmac); + + pktlen -= sizeof(struct udphdr); + proto = inner_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + cur += build_udp_header(cur, UDP_SRC_PORT, VN_PORT, pktlen); + cur += build_geneve_header(cur, VN_ID); + cur += build_eth(cur, proto, dmac, smac); + + pktlen = sizeof(struct udphdr) + payload_len; + if (inner_family == AF_INET) + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + else + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + + cur += build_udp_packet(cur, UDP_DST_PORT, UDP_SRC_PORT, payload_len, + inner_family, false); + + build_udp_packet_csum(outer_udph, outer_family, false); + + return cur - buf; +} + +static int +receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + int *r_num_mss) +{ + uint8_t packet_buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int len, total_len = 0, socket = self->sock; + int payload_len = variant->data_size; + + while (total_len < payload_len) { + len = recv(socket, packet_buf, sizeof(packet_buf), 0); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("recv"); + break; + } + + (*r_num_mss)++; + total_len += len; + } + + return total_len; +} + +TEST_F(tun_vnet_udptnl, send_gso_packet) +{ + uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ]; + int r_num_mss = 0; + int ret, off; + + memset(pkt, 0, sizeof(pkt)); + off = build_gso_packet_into_tun(variant, pkt); + ret = write(self->fd, pkt, off); + ASSERT_EQ(ret, off); + + ret = receive_gso_packet_from_tunnel(self, variant, &r_num_mss); + ASSERT_EQ(ret, variant->data_size); + ASSERT_EQ(r_num_mss, variant->r_num_mss); } TEST_HARNESS_MAIN -- cgit v1.2.3 From 6bdd7ae6059ec3c19f4717387706f92cd8b715a6 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:05:00 +0800 Subject: selftest: tun: Add test for receiving gso packet from tun The test validate that GSO information are correctly exposed when reading packets from a TUN device. Signed-off-by: Xu Du Link: https://patch.msgid.link/fe75ac66466380490eba858eef50596a1bfbd071.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 194 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index f0ddb5d37683..628ae2caf6f4 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -364,6 +364,116 @@ static int ip_route_check(const char *intf, int family, void *addr) return 0; } +static int send_gso_udp_msg(int socket, struct sockaddr_storage *addr, + uint8_t *send_buf, int send_len, int gso_size) +{ + char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 }; + int alen = sockaddr_len(addr->ss_family); + struct msghdr msg = { 0 }; + struct iovec iov = { 0 }; + int ret; + + iov.iov_base = send_buf; + iov.iov_len = send_len; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = addr; + msg.msg_namelen = alen; + + if (gso_size > 0) { + struct cmsghdr *cmsg; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_UDP; + cmsg->cmsg_type = UDP_SEGMENT; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint16_t)); + *(uint16_t *)CMSG_DATA(cmsg) = gso_size; + } + + ret = sendmsg(socket, &msg, 0); + if (ret < 0) + perror("sendmsg"); + + return ret; +} + +static int validate_hdrlen(uint8_t **cur, int *len, int x) +{ + if (*len < x) + return -1; + *cur += x; + *len -= x; + return 0; +} + +static int parse_udp_tunnel_vnet_packet(uint8_t *buf, int len, int tunnel_type, + bool is_tap) +{ + struct ipv6hdr *iph6; + struct udphdr *udph; + struct iphdr *iph4; + uint8_t *cur = buf; + + if (validate_hdrlen(&cur, &len, TUN_VNET_TNL_SIZE)) + return -1; + + if (is_tap) { + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + } + + if (tunnel_type & UDP_TUNNEL_OUTER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != VN_PORT) + return -1; + + if (validate_hdrlen(&cur, &len, GENEVE_HLEN)) + return -1; + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + + if (tunnel_type & UDP_TUNNEL_INNER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != UDP_DST_PORT) + return -1; + + return len; +} + FIXTURE(tun) { char ifname[IFNAMSIZ]; @@ -664,6 +774,68 @@ receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, return total_len; } +static int send_gso_packet_into_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant) +{ + int family = (variant->tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ] = { 0 }; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + struct sockaddr_storage ssa, dsa; + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + return send_gso_udp_msg(self->sock, &dsa, buf, payload_len, gso_size); +} + +static int +receive_gso_packet_from_tun(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + struct virtio_net_hdr_v1_hash_tunnel *vnet_hdr) +{ + struct timeval timeout = { .tv_sec = TIMEOUT_SEC }; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + bool is_tap = variant->is_tap; + int ret, len, total_len = 0; + int tun_fd = self->fd; + fd_set fdset; + + while (total_len < payload_len) { + FD_ZERO(&fdset); + FD_SET(tun_fd, &fdset); + + ret = select(tun_fd + 1, &fdset, NULL, NULL, &timeout); + if (ret <= 0) { + perror("select"); + break; + } + if (!FD_ISSET(tun_fd, &fdset)) + continue; + + len = read(tun_fd, buf, sizeof(buf)); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("read"); + break; + } + + len = parse_udp_tunnel_vnet_packet(buf, len, tunnel_type, + is_tap); + if (len < 0) + continue; + + if (total_len == 0) + memcpy(vnet_hdr, buf, TUN_VNET_TNL_SIZE); + + total_len += len; + } + + return total_len; +} + TEST_F(tun_vnet_udptnl, send_gso_packet) { uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ]; @@ -680,4 +852,26 @@ TEST_F(tun_vnet_udptnl, send_gso_packet) ASSERT_EQ(r_num_mss, variant->r_num_mss); } +TEST_F(tun_vnet_udptnl, recv_gso_packet) +{ + struct virtio_net_hdr_v1_hash_tunnel vnet_hdr = { 0 }; + struct virtio_net_hdr_v1 *vh = &vnet_hdr.hash_hdr.hdr; + int ret, gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; + + ret = send_gso_packet_into_tunnel(self, variant); + ASSERT_EQ(ret, variant->data_size); + + memset(&vnet_hdr, 0, sizeof(vnet_hdr)); + ret = receive_gso_packet_from_tun(self, variant, &vnet_hdr); + ASSERT_EQ(ret, variant->data_size); + + if (!variant->no_gso) { + ASSERT_EQ(vh->gso_size, variant->gso_size); + gso_type |= (variant->tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4) : + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6); + ASSERT_EQ(vh->gso_type, gso_type); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 87db2fdef5a7c2ea8a404afd402800d48940d6b2 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:05:01 +0800 Subject: selftest: tun: Add test data for success and failure paths To improve the robustness and coverage of the TUN selftests, this patch expands the set of test data. Signed-off-by: Xu Du Link: https://patch.msgid.link/5054f3ad9f3dbfe33b827183fccc5efeb8fd0da7.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 115 +++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 628ae2caf6f4..8a5cd5cb5472 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -57,6 +57,10 @@ static struct in6_addr param_ipaddr6_inner_src = { { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, }; +#ifndef BIT +#define BIT(nr) (1UL << (nr)) +#endif + #define VN_ID 1 #define VN_PORT 4789 #define UDP_SRC_PORT 22 @@ -72,6 +76,8 @@ static struct in6_addr param_ipaddr6_inner_src = { #define UDP_TUNNEL_GENEVE_4IN6 0x04 #define UDP_TUNNEL_GENEVE_6IN6 0x08 +#define UDP_TUNNEL_MAX_SEGMENTS BIT(7) + #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) @@ -553,6 +559,39 @@ FIXTURE_VARIANT(tun_vnet_udptnl) /* clang-format off */ #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1byte) { \ + /* no GSO: send a single byte */ \ + .tunnel_type = type, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1mss) { \ + /* no GSO: send a single MSS, fall back to no GSO */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_gtmss) { \ + /* no GSO: send a single MSS + 1B: fail */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1byte) { \ + /* GSO: send 1 byte, gso 1 byte, fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) { \ /* send a single MSS: fall back to no GSO */ \ .tunnel_type = type, \ @@ -561,8 +600,65 @@ FIXTURE_VARIANT(tun_vnet_udptnl) .r_num_mss = 1, \ .is_tap = true, \ .no_gso = true, \ - }; -/* clang-format on */ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_ltgso) { \ + /* data <= MSS < gso: will fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type) + 1, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_gtgso) { \ + /* GSO: a single MSS + 1B */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_2mss) { \ + /* no GSO: send exactly 2 MSS */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) * 2, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxbytes) { \ + /* GSO: send max bytes */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MAX(type, true), \ + .r_num_mss = UDP_TUNNEL_MAX(type, true) / \ + UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_over_maxbytes) { \ + /* GSO: send oversize max bytes: fail */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = ETH_MAX_MTU, \ + .r_num_mss = ETH_MAX_MTU / UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxsegs) { \ + /* GSO: send max number of min sized segments */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = UDP_TUNNEL_MAX_SEGMENTS, \ + .r_num_mss = UDP_TUNNEL_MAX_SEGMENTS, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_5byte) { \ + /* GSO: send 5 bytes, gso 2 bytes */ \ + .tunnel_type = type, \ + .gso_size = 2, \ + .data_size = 5, \ + .r_num_mss = 3, \ + .is_tap = true, \ + } /* clang-format on */ TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4); @@ -874,4 +970,19 @@ TEST_F(tun_vnet_udptnl, recv_gso_packet) } } +XFAIL_ADD(tun_vnet_udptnl, 4in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_nogsosz_gtmss, recv_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, send_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet); + TEST_HARNESS_MAIN -- cgit v1.2.3 From d84f24c898864facc13412a58b78964f6a769d76 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:05 -0800 Subject: perf header: Fix memory leaks in process_cpu_domain_info() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do_read_string() returns a string in allocated memory, for some reason there was unused memory allocations and unnecessary strdups. Remove these and make the "perf annotate basic tests" leak sanitizer clean. Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info") Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: linux-arm-kernel@lists.infradead.org Cc: linux-csky@vger.kernel.org Cc: linux-riscv@lists.infradead.org Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Swapnil Sapkal Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 9a15dd4b7640..eefd1cd73b6a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3634,6 +3634,7 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused if (!d_info) return -1; + assert(cd_map[cpu]->domains[domain] == NULL); cd_map[cpu]->domains[domain] = d_info; d_info->domain = domain; @@ -3642,30 +3643,20 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused if (!dname) return -1; - d_info->dname = zalloc(strlen(dname) + 1); - if (!d_info->dname) - return -1; - - d_info->dname = strdup(dname); + d_info->dname = dname; } cpumask = do_read_string(ff); if (!cpumask) return -1; - d_info->cpumask = zalloc(strlen(cpumask) + 1); - if (!d_info->cpumask) - return -1; - d_info->cpumask = strdup(cpumask); + d_info->cpumask = cpumask; cpulist = do_read_string(ff); if (!cpulist) return -1; - d_info->cpulist = zalloc(strlen(cpulist) + 1); - if (!d_info->cpulist) - return -1; - d_info->cpulist = strdup(cpulist); + d_info->cpulist = cpulist; } } -- cgit v1.2.3 From 00419892bac28bf148450d762bbff990a6bd5494 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:06 -0800 Subject: perf annotate: Fix args leak of map_symbol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit map_symbol__exit() needs calling on an annotate_args.ms, however, rather than introduce proper reference count handling to symbol__annotate() just switch to passing the map_symbol pointer parameter around, making the puts the caller's responsibility. Fix a number of cases to ensure the map in a map_symbol has a reference count increment and add the then necessary map_symbol_exits. Fixes: 56e144fe98260a0f ("perf mem_info: Add and use map_symbol__exit and addr_map_symbol__exit") Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: linux-arm-kernel@lists.infradead.org Cc: linux-csky@vger.kernel.org Cc: linux-riscv@lists.infradead.org Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/loongarch/annotate/instructions.c | 14 +++++---- tools/perf/arch/s390/annotate/instructions.c | 11 ++++--- tools/perf/util/annotate.c | 2 +- tools/perf/util/capstone.c | 14 ++++----- tools/perf/util/disasm.c | 36 +++++++++++++---------- tools/perf/util/disasm.h | 2 +- tools/perf/util/llvm.c | 6 ++-- 7 files changed, 47 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 70262d5f1444..1c3abb43c8d7 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -10,9 +10,7 @@ static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, st { char *c, *endptr, *tok, *name; struct map *map = ms->map; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; + struct addr_map_symbol target; c = strchr(ops->raw, '#'); if (c++ == NULL) @@ -38,12 +36,16 @@ static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, st if (ops->target.name == NULL) return -1; - target.addr = map__objdump_2mem(map, ops->target.addr); + target = (struct addr_map_symbol) { + .ms = { .map = map__get(map), }, + .addr = map__objdump_2mem(map, ops->target.addr), + }; if (maps__find_ams(ms->maps, &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; + addr_map_symbol__exit(&target); return 0; } @@ -58,7 +60,7 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st struct map *map = ms->map; struct symbol *sym = ms->sym; struct addr_map_symbol target = { - .ms = { .map = map, }, + .ms = { .map = map__get(map), }, }; const char *c = strchr(ops->raw, '#'); u64 start, end; @@ -90,7 +92,7 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st } else { ops->target.offset_avail = false; } - + addr_map_symbol__exit(&target); return 0; } diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index c61193f1e096..626e6d2cbc81 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -6,9 +6,7 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops, { char *endptr, *tok, *name; struct map *map = ms->map; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; + struct addr_map_symbol target; tok = strchr(ops->raw, ','); if (!tok) @@ -36,12 +34,17 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops, if (ops->target.name == NULL) return -1; - target.addr = map__objdump_2mem(map, ops->target.addr); + + target = (struct addr_map_symbol) { + .ms = { .map = map__get(map), }, + .addr = map__objdump_2mem(map, ops->target.addr), + }; if (maps__find_ams(ms->maps, &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; + addr_map_symbol__exit(&target); return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index cc7764455faf..791d60f97c23 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1031,7 +1031,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return 0; args.arch = arch; - args.ms = *ms; + args.ms = ms; if (notes->src == NULL) { notes->src = annotated_source__new(); diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c index be5fd44b1f9d..2c7feab61b7b 100644 --- a/tools/perf/util/capstone.c +++ b/tools/perf/util/capstone.c @@ -143,7 +143,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct annotate_args *args, u64 addr) { int i; - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct symbol *sym; /* TODO: support more architectures */ @@ -222,7 +222,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused, { #ifdef HAVE_LIBCAPSTONE_SUPPORT struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); u64 start = map__rip_2objdump(map, sym->start); u64 offset; @@ -256,7 +256,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused, args->line = disasm_buf; args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl == NULL) @@ -268,7 +268,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused, !strcmp(args->options->disassembler_style, "att")) disassembler_style = true; - if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0) goto err; needs_cs_close = true; @@ -345,7 +345,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused, { #ifdef HAVE_LIBCAPSTONE_SUPPORT struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); struct nscookie nsc; u64 start = map__rip_2objdump(map, sym->start); @@ -382,7 +382,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused, !strcmp(args->options->disassembler_style, "att")) disassembler_style = true; - if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0) goto err; needs_cs_close = true; @@ -408,7 +408,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused, args->line = disasm_buf; args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl == NULL) diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 50b9433f3f8e..924429142631 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -269,9 +269,7 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s { char *endptr, *tok, *name; struct map *map = ms->map; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; + struct addr_map_symbol target; ops->target.addr = strtoull(ops->raw, &endptr, 16); @@ -296,12 +294,16 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s if (ops->target.name == NULL) return -1; find_target: - target.addr = map__objdump_2mem(map, ops->target.addr); + target = (struct addr_map_symbol) { + .ms = { .map = map__get(map), }, + .addr = map__objdump_2mem(map, ops->target.addr), + }; if (maps__find_ams(ms->maps, &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; + addr_map_symbol__exit(&target); return 0; indirect_call: @@ -366,7 +368,7 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s struct map *map = ms->map; struct symbol *sym = ms->sym; struct addr_map_symbol target = { - .ms = { .map = map, }, + .ms = { .map = map__get(map), }, }; const char *c = strchr(ops->raw, ','); u64 start, end; @@ -440,7 +442,7 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s } else { ops->target.offset_avail = false; } - + addr_map_symbol__exit(&target); return 0; } @@ -1046,7 +1048,7 @@ static size_t disasm_line_size(int nr) struct disasm_line *disasm_line__new(struct annotate_args *args) { struct disasm_line *dl = NULL; - struct annotation *notes = symbol__annotation(args->ms.sym); + struct annotation *notes = symbol__annotation(args->ms->sym); int nr = notes->src->nr_events; dl = zalloc(disasm_line_size(nr)); @@ -1064,7 +1066,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args) } else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) goto out_free_line; - disasm_line__init_ins(dl, args->arch, &args->ms); + disasm_line__init_ins(dl, args->arch, args->ms); } return dl; @@ -1119,7 +1121,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct annotate_args *args, char *parsed_line, int *line_nr, char **fileloc) { - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct annotation *notes = symbol__annotation(sym); struct disasm_line *dl; char *tmp; @@ -1151,7 +1153,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, args->line = parsed_line; args->line_nr = *line_nr; args->fileloc = *fileloc; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); (*line_nr)++; @@ -1169,12 +1171,14 @@ static int symbol__parse_objdump_line(struct symbol *sym, if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) { struct addr_map_symbol target = { .addr = dl->ops.target.addr, - .ms = { .map = map, }, + .ms = { .map = map__get(map), }, }; - if (!maps__find_ams(args->ms.maps, &target) && + if (!maps__find_ams(args->ms->maps, &target) && target.ms.sym->start == target.al_addr) dl->ops.target.sym = target.ms.sym; + + addr_map_symbol__exit(&target); } annotation_line__add(&dl->al, ¬es->src->source); @@ -1338,7 +1342,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym, struct annotate_args *args) { struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); u64 start = map__rip_2objdump(map, sym->start); u64 end = map__rip_2objdump(map, sym->end); @@ -1375,7 +1379,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym, args->line = disasm_buf; args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl == NULL) @@ -1501,7 +1505,7 @@ static int symbol__disassemble_objdump(const char *filename, struct symbol *sym, struct annotate_args *args) { struct annotation_options *opts = &annotate_opts; - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); char *command; FILE *file; @@ -1644,7 +1648,7 @@ out_free_command: int symbol__disassemble(struct symbol *sym, struct annotate_args *args) { struct annotation_options *options = args->options; - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); char symfs_filename[PATH_MAX]; bool delete_extract = false; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index d2cb555e4a3b..a3ea9d676281 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -97,7 +97,7 @@ struct ins_ops { struct annotate_args { struct arch *arch; - struct map_symbol ms; + struct map_symbol *ms; struct annotation_options *options; s64 offset; char *line; diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c index 2ebf1f5f65bf..4ada9a10bd93 100644 --- a/tools/perf/util/llvm.c +++ b/tools/perf/util/llvm.c @@ -118,7 +118,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym, { #ifdef HAVE_LIBLLVM_SUPPORT struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct dso *dso = map__dso(map); u64 start = map__rip_2objdump(map, sym->start); /* Malloc-ed buffer containing instructions read from disk. */ @@ -184,7 +184,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym, args->line = disasm_buf; args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl == NULL) @@ -242,7 +242,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym, &line_storage_len); args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; llvm_addr2line(filename, pc, &args->fileloc, (unsigned int *)&args->line_nr, false, NULL); -- cgit v1.2.3 From 6fdd2676db55b503c52dd3f1359b5c57f774ab75 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:07 -0800 Subject: perf maps: Fix reference count leak in maps__find_ams() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ams and so ams->ms.map is an in argument, however, it is also overwritten. As a map is reference counted, ensure a map__put() is done before overwriting it. Fixes: 42fd623b58dbcc48 ("perf maps: Get map before returning in maps__find") Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 8ccc46d515b6..4092211cff62 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -708,6 +708,7 @@ int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams) if (ams->addr < map__start(ams->ms.map) || ams->addr >= map__end(ams->ms.map)) { if (maps == NULL) return -1; + map__put(ams->ms.map); ams->ms.map = maps__find(maps, ams->addr); if (ams->ms.map == NULL) return -1; -- cgit v1.2.3 From 57d26593a92fdeaca5adcbbb5362fa13d5dd7540 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:08 -0800 Subject: perf disasm: Constify use of 'struct arch' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'struct arch' holds variables that are read but not written, except during some initialization. Change most uses to be for a "const struct arch *" version to capture this immutability. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/annotate/instructions.c | 2 +- tools/perf/arch/loongarch/annotate/instructions.c | 12 ++++--- tools/perf/arch/s390/annotate/instructions.c | 7 ++-- tools/perf/arch/x86/annotate/instructions.c | 4 +-- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate-data.c | 2 +- tools/perf/util/annotate-data.h | 2 +- tools/perf/util/annotate.c | 28 +++++++-------- tools/perf/util/annotate.h | 10 +++--- tools/perf/util/disasm.c | 42 +++++++++++++---------- tools/perf/util/disasm.h | 14 ++++---- 11 files changed, 67 insertions(+), 58 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 16cb62d40bd9..5099fa36180d 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -10,7 +10,7 @@ struct arm64_annotate { jump_insn; }; -static int arm64_mov__parse(struct arch *arch __maybe_unused, +static int arm64_mov__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 1c3abb43c8d7..5ebfe629ea68 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -5,8 +5,10 @@ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ -static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, - struct disasm_line *dl __maybe_unused) +static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) + { char *c, *endptr, *tok, *name; struct map *map = ms->map; @@ -54,8 +56,10 @@ static struct ins_ops loongarch_call_ops = { .scnprintf = call__scnprintf, }; -static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, - struct disasm_line *dl __maybe_unused) +static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) + { struct map *map = ms->map; struct symbol *sym = ms->sym; diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index 626e6d2cbc81..37c1b62641d8 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include -static int s390_call__parse(struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms, struct disasm_line *dl __maybe_unused) +static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; struct map *map = ms->map; @@ -53,7 +54,7 @@ static struct ins_ops s390_call_ops = { .scnprintf = call__scnprintf, }; -static int s390_mov__parse(struct arch *arch __maybe_unused, +static int s390_mov__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 803f9351a3fb..24b388bacdae 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -119,7 +119,7 @@ static struct ins x86__instructions[] = { { .name = "xorps", .ops = &mov_ops, }, }; -static bool amd__ins_is_fused(struct arch *arch, const char *ins1, +static bool amd__ins_is_fused(const struct arch *arch, const char *ins1, const char *ins2) { if (strstr(ins2, "jmp")) @@ -142,7 +142,7 @@ static bool amd__ins_is_fused(struct arch *arch, const char *ins1, return false; } -static bool intel__ins_is_fused(struct arch *arch, const char *ins1, +static bool intel__ins_is_fused(const struct arch *arch, const char *ins1, const char *ins2) { if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp")) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 36aca8d6d003..3df61cd46652 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -30,7 +30,7 @@ struct annotate_browser { struct rb_root entries; struct rb_node *curr_hot; struct annotation_line *selection; - struct arch *arch; + const struct arch *arch; /* * perf top can delete hist_entry anytime. Callers should make sure * its lifetime. diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 07cf9c334be0..edfcd6e9df9c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -160,7 +160,7 @@ bool has_reg_type(struct type_state *state, int reg) return (unsigned)reg < ARRAY_SIZE(state->regs); } -static void init_type_state(struct type_state *state, struct arch *arch) +static void init_type_state(struct type_state *state, const struct arch *arch) { memset(state, 0, sizeof(*state)); INIT_LIST_HEAD(&state->stack_vars); diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 869307c7f130..9b222869e42d 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -117,7 +117,7 @@ extern struct annotated_data_type canary_type; */ struct data_loc_info { /* These are input field, should be filled by caller */ - struct arch *arch; + const struct arch *arch; struct thread *thread; struct map_symbol *ms; u64 ip; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 791d60f97c23..132af2556aec 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -761,7 +761,7 @@ static int disasm_line__print(struct disasm_line *dl, u64 start, int addr_fmt_wi } static struct annotated_data_type * -__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch, +__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch, struct debuginfo *dbg, struct disasm_line *dl, int *type_offset); @@ -980,11 +980,11 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel) annotation__calc_percent(notes, evsel, symbol__size(sym)); } -int evsel__get_arch(struct evsel *evsel, struct arch **parch) +int evsel__get_arch(struct evsel *evsel, const struct arch **parch) { struct perf_env *env = evsel__env(evsel); const char *arch_name = perf_env__arch(env); - struct arch *arch; + const struct arch *arch; int err; if (!arch_name) { @@ -999,7 +999,7 @@ int evsel__get_arch(struct evsel *evsel, struct arch **parch) } if (arch->init) { - err = arch->init(arch, env ? env->cpuid : NULL); + err = arch->init((struct arch *)arch, env ? env->cpuid : NULL); if (err) { pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name); @@ -1010,14 +1010,14 @@ int evsel__get_arch(struct evsel *evsel, struct arch **parch) } int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, - struct arch **parch) + const struct arch **parch) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotate_args args = { .options = &annotate_opts, }; - struct arch *arch = NULL; + const struct arch *arch = NULL; int err, nr; err = evsel__get_arch(evsel, &arch); @@ -2204,7 +2204,7 @@ print_addr: } int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, - struct arch **parch) + const struct arch **parch) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); @@ -2457,7 +2457,7 @@ int annotate_check_args(void) * to revisit the format when it handles different architecture. * Fills @reg and @offset when return 0. */ -static int extract_reg_offset(struct arch *arch, const char *str, +static int extract_reg_offset(const struct arch *arch, const char *str, struct annotated_op_loc *op_loc) { char *p; @@ -2538,7 +2538,7 @@ static int extract_reg_offset(struct arch *arch, const char *str, * # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1 * # dst_multi_regs = 1, dst_offset = 8 */ -int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, +int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, struct annotated_insn_loc *loc) { struct ins_operands *ops; @@ -2673,7 +2673,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head, return istat; } -static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) +static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl) { if (arch__is(arch, "x86")) { if (!strncmp(dl->ins.name, "push", 4) || @@ -2686,7 +2686,7 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) return false; } -static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) +static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *loc) { /* On x86_64, %gs:40 is used for stack canary */ if (arch__is(arch, "x86")) { @@ -2702,7 +2702,7 @@ static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) * Returns true if the instruction has a memory operand without * performing a load/store */ -static bool is_address_gen_insn(struct arch *arch, struct disasm_line *dl) +static bool is_address_gen_insn(const struct arch *arch, struct disasm_line *dl) { if (arch__is(arch, "x86")) { if (!strncmp(dl->ins.name, "lea", 3)) @@ -2791,7 +2791,7 @@ void debuginfo_cache__delete(void) } static struct annotated_data_type * -__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch, +__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch, struct debuginfo *dbg, struct disasm_line *dl, int *type_offset) { @@ -2895,7 +2895,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) { struct map_symbol *ms = &he->ms; struct evsel *evsel = hists_to_evsel(he->hists); - struct arch *arch; + const struct arch *arch; struct disasm_line *dl; struct annotated_data_type *mem_type; struct annotated_item_stat *istat; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d4990bff29a7..58eaf4b2fa65 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -202,7 +202,7 @@ struct annotation_write_ops { struct annotation_print_data { struct hist_entry *he; struct evsel *evsel; - struct arch *arch; + const struct arch *arch; struct debuginfo *dbg; /* save data type info keyed by al->offset */ struct hashmap *type_hash; @@ -441,10 +441,10 @@ void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, - struct arch **parch); + const struct arch **parch); int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, - struct arch **parch); + const struct arch **parch); enum symbol_disassemble_errno { SYMBOL_ANNOTATE_ERRNO__SUCCESS = 0, @@ -546,7 +546,7 @@ struct annotated_insn_loc { i++, op_loc++) /* Get detailed location info in the instruction */ -int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, +int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, struct annotated_insn_loc *loc); /* Returns a data type from the sample instruction (if any) */ @@ -586,5 +586,5 @@ int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr, int num_aggr, struct evsel *evsel); int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header); -int evsel__get_arch(struct evsel *evsel, struct arch **parch); +int evsel__get_arch(struct evsel *evsel, const struct arch **parch); #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 924429142631..d92c0424e8fc 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -213,7 +213,7 @@ static void arch__sort(void) qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); } -struct arch *arch__find(const char *name) +const struct arch *arch__find(const char *name) { const int nmemb = ARRAY_SIZE(architectures); static bool sorted; @@ -226,7 +226,7 @@ struct arch *arch__find(const char *name) return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); } -bool arch__is(struct arch *arch, const char *name) +bool arch__is(const struct arch *arch, const char *name) { return !strcmp(arch->name, name); } @@ -256,7 +256,7 @@ static int ins__scnprintf(struct ins *ins, char *bf, size_t size, return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); } -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) +bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2) { if (!arch || !arch->ins_is_fused) return false; @@ -264,7 +264,7 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) return arch->ins_is_fused(arch, ins1, ins2); } -static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, +static int call__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; @@ -362,7 +362,7 @@ static inline const char *validate_comma(const char *c, struct ins_operands *ops return c; } -static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, +static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, struct disasm_line *dl __maybe_unused) { struct map *map = ms->map; @@ -525,7 +525,7 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) return 0; } -static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, +static int lock__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, struct disasm_line *dl __maybe_unused) { ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); @@ -592,7 +592,7 @@ static struct ins_ops lock_ops = { * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check * the input string after 'memory_ref_char' if exists. */ -static bool check_multi_regs(struct arch *arch, const char *op) +static bool check_multi_regs(const struct arch *arch, const char *op) { int count = 0; @@ -613,8 +613,9 @@ static bool check_multi_regs(struct arch *arch, const char *op) return count > 1; } -static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, - struct disasm_line *dl __maybe_unused) +static int mov__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *comment, prev; @@ -719,7 +720,7 @@ static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size, * - Add to Zero Extended XO-form ( Ex: addze, addzeo ) * - Subtract From Zero Extended XO-form ( Ex: subfze ) */ -static int arithmetic__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, +static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl) { int opcode = PPC_OP(dl->raw.raw_insn); @@ -756,7 +757,7 @@ static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, * used by powerpc and since binary instruction code is used to * extract opcode, regs and offset, no other parsing is needed here */ -static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, +static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) { ops->source.mem_ref = true; @@ -776,8 +777,9 @@ static struct ins_ops load_store_ops = { .scnprintf = load_store__scnprintf, }; -static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, - struct disasm_line *dl __maybe_unused) +static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *target, *comment, *s, prev; @@ -867,7 +869,8 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct disasm_line *dl) +static struct ins_ops *__ins__find(const struct arch *arch, const char *name, + struct disasm_line *dl) { struct ins *ins; const int nmemb = arch->nr_instructions; @@ -885,8 +888,8 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d } if (!arch->sorted_instructions) { - ins__sort(arch); - arch->sorted_instructions = true; + ins__sort((struct arch *)arch); + ((struct arch *)arch)->sorted_instructions = true; } ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); @@ -913,17 +916,18 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl) +struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl) { struct ins_ops *ops = __ins__find(arch, name, dl); if (!ops && arch->associate_instruction_ops) - ops = arch->associate_instruction_ops(arch, name); + ops = arch->associate_instruction_ops((struct arch *)arch, name); return ops; } -static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) +static void disasm_line__init_ins(struct disasm_line *dl, const struct arch *arch, + struct map_symbol *ms) { dl->ins.ops = ins__find(arch, dl->ins.name, dl); diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index a3ea9d676281..273a9c906514 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -30,7 +30,7 @@ struct arch { unsigned int model; unsigned int family; int (*init)(struct arch *arch, char *cpuid); - bool (*ins_is_fused)(struct arch *arch, const char *ins1, + bool (*ins_is_fused)(const struct arch *arch, const char *ins1, const char *ins2); struct { char comment_char; @@ -89,14 +89,14 @@ struct ins_operands { struct ins_ops { void (*free)(struct ins_operands *ops); - int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + int (*parse)(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, struct disasm_line *dl); int (*scnprintf)(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); }; struct annotate_args { - struct arch *arch; + const struct arch *arch; struct map_symbol *ms; struct annotation_options *options; s64 offset; @@ -105,14 +105,14 @@ struct annotate_args { char *fileloc; }; -struct arch *arch__find(const char *name); -bool arch__is(struct arch *arch, const char *name); +const struct arch *arch__find(const char *name); +bool arch__is(const struct arch *arch, const char *name); -struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl); +struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); bool ins__is_call(const struct ins *ins); bool ins__is_jump(const struct ins *ins); -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); +bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2); bool ins__is_ret(const struct ins *ins); bool ins__is_lock(const struct ins *ins); -- cgit v1.2.3 From 1e3b91d6c53e2b5e01424511d009b6405d8a4152 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:09 -0800 Subject: perf disasm: Constify use of 'struct ins_op' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'struct ins_op' holds variables to function pointers that are read but not written. Change uses to be for a "const struct ins_op *" version to capture this immutability. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/annotate/instructions.c | 4 +- tools/perf/arch/arm64/annotate/instructions.c | 6 +-- tools/perf/arch/csky/annotate/instructions.c | 6 +-- tools/perf/arch/loongarch/annotate/instructions.c | 8 ++-- tools/perf/arch/mips/annotate/instructions.c | 4 +- tools/perf/arch/powerpc/annotate/instructions.c | 6 +-- tools/perf/arch/riscv64/annotate/instructions.c | 4 +- tools/perf/arch/s390/annotate/instructions.c | 8 ++-- tools/perf/arch/sparc/annotate/instructions.c | 4 +- tools/perf/util/disasm.c | 46 +++++++++++------------ tools/perf/util/disasm.h | 6 +-- 11 files changed, 51 insertions(+), 51 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c index 5e667b0f5512..b997d127fedd 100644 --- a/tools/perf/arch/arm/annotate/instructions.c +++ b/tools/perf/arch/arm/annotate/instructions.c @@ -11,10 +11,10 @@ struct arm_annotate { jump_insn; }; -static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name) +static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name) { struct arm_annotate *arm = arch->priv; - struct ins_ops *ops; + const struct ins_ops *ops; regmatch_t match[2]; if (!regexec(&arm->call_insn, name, 2, match, 0)) diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 5099fa36180d..363af2f55122 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -63,15 +63,15 @@ out_free_source: static int mov__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -static struct ins_ops arm64_mov_ops = { +static const struct ins_ops arm64_mov_ops = { .parse = arm64_mov__parse, .scnprintf = mov__scnprintf, }; -static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) +static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) { struct arm64_annotate *arm = arch->priv; - struct ins_ops *ops; + const struct ins_ops *ops; regmatch_t match[2]; if (!regexec(&arm->jump_insn, name, 2, match, 0)) diff --git a/tools/perf/arch/csky/annotate/instructions.c b/tools/perf/arch/csky/annotate/instructions.c index 14270311d215..4a55c84a320a 100644 --- a/tools/perf/arch/csky/annotate/instructions.c +++ b/tools/perf/arch/csky/annotate/instructions.c @@ -3,10 +3,10 @@ #include -static struct ins_ops *csky__associate_ins_ops(struct arch *arch, - const char *name) +static const struct ins_ops *csky__associate_ins_ops(struct arch *arch, + const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; /* catch all kind of jumps */ if (!strcmp(name, "bt") || diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 5ebfe629ea68..5010d5d58375 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -51,7 +51,7 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o return 0; } -static struct ins_ops loongarch_call_ops = { +static const struct ins_ops loongarch_call_ops = { .parse = loongarch_call__parse, .scnprintf = call__scnprintf, }; @@ -100,15 +100,15 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o return 0; } -static struct ins_ops loongarch_jump_ops = { +static const struct ins_ops loongarch_jump_ops = { .parse = loongarch_jump__parse, .scnprintf = jump__scnprintf, }; static -struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) +const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; if (!strcmp(name, "bl")) ops = &loongarch_call_ops; diff --git a/tools/perf/arch/mips/annotate/instructions.c b/tools/perf/arch/mips/annotate/instructions.c index b50b46c613d6..0fbe0a7df95a 100644 --- a/tools/perf/arch/mips/annotate/instructions.c +++ b/tools/perf/arch/mips/annotate/instructions.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 static -struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name) +const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; if (!strncmp(name, "bal", 3) || !strncmp(name, "bgezal", 6) || diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index ca567cfdcbdb..d1be55425e35 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include -static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name) +static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name) { int i; - struct ins_ops *ops; + const struct ins_ops *ops; /* * - Interested only if instruction starts with 'b'. @@ -189,7 +189,7 @@ static int cmp_offset(const void *a, const void *b) return (val1->value - val2->value); } -static struct ins_ops *check_ppc_insn(struct disasm_line *dl) +static const struct ins_ops *check_ppc_insn(struct disasm_line *dl) { int raw_insn = dl->raw.raw_insn; int opcode = PPC_OP(raw_insn); diff --git a/tools/perf/arch/riscv64/annotate/instructions.c b/tools/perf/arch/riscv64/annotate/instructions.c index 55cf911633f8..a34798864fab 100644 --- a/tools/perf/arch/riscv64/annotate/instructions.c +++ b/tools/perf/arch/riscv64/annotate/instructions.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 static -struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name) +const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; if (!strncmp(name, "jal", 3) || !strncmp(name, "jr", 2) || diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index 37c1b62641d8..1b22e6276e7d 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -49,7 +49,7 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, return 0; } -static struct ins_ops s390_call_ops = { +static const struct ins_ops s390_call_ops = { .parse = s390_call__parse, .scnprintf = call__scnprintf, }; @@ -103,14 +103,14 @@ out_free_source: } -static struct ins_ops s390_mov_ops = { +static const struct ins_ops s390_mov_ops = { .parse = s390_mov__parse, .scnprintf = mov__scnprintf, }; -static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) +static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; /* catch all kind of jumps */ if (strchr(name, 'j') || diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c index 68c31580ccfc..a08d8734c883 100644 --- a/tools/perf/arch/sparc/annotate/instructions.c +++ b/tools/perf/arch/sparc/annotate/instructions.c @@ -117,9 +117,9 @@ static int is_branch_float_cond(const char *cond) return 0; } -static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) +static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) { - struct ins_ops *ops = NULL; + const struct ins_ops *ops = NULL; if (!strcmp(name, "call") || !strcmp(name, "jmp") || diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index d92c0424e8fc..9bc9b1de98db 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -33,15 +33,15 @@ static regex_t file_lineno; /* These can be referred from the arch-dependent code */ -static struct ins_ops call_ops; -static struct ins_ops dec_ops; -static struct ins_ops jump_ops; -static struct ins_ops mov_ops; -static struct ins_ops nop_ops; -static struct ins_ops lock_ops; -static struct ins_ops ret_ops; -static struct ins_ops load_store_ops; -static struct ins_ops arithmetic_ops; +static const struct ins_ops call_ops; +static const struct ins_ops dec_ops; +static const struct ins_ops jump_ops; +static const struct ins_ops mov_ops; +static const struct ins_ops nop_ops; +static const struct ins_ops lock_ops; +static const struct ins_ops ret_ops; +static const struct ins_ops load_store_ops; +static const struct ins_ops arithmetic_ops; static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); @@ -85,7 +85,7 @@ grow_from_non_allocated_table: goto out_update_instructions; } -static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops) +static int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops) { struct ins *ins; @@ -334,7 +334,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); } -static struct ins_ops call_ops = { +static const struct ins_ops call_ops = { .parse = call__parse, .scnprintf = call__scnprintf, }; @@ -487,7 +487,7 @@ static void jump__delete(struct ins_operands *ops __maybe_unused) */ } -static struct ins_ops jump_ops = { +static const struct ins_ops jump_ops = { .free = jump__delete, .parse = jump__parse, .scnprintf = jump__scnprintf, @@ -579,7 +579,7 @@ static void lock__delete(struct ins_operands *ops) zfree(&ops->target.name); } -static struct ins_ops lock_ops = { +static const struct ins_ops lock_ops = { .free = lock__delete, .parse = lock__parse, .scnprintf = lock__scnprintf, @@ -688,7 +688,7 @@ static int mov__scnprintf(struct ins *ins, char *bf, size_t size, ops->target.name ?: ops->target.raw); } -static struct ins_ops mov_ops = { +static const struct ins_ops mov_ops = { .parse = mov__parse, .scnprintf = mov__scnprintf, }; @@ -738,7 +738,7 @@ static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_ return 0; } -static struct ins_ops arithmetic_ops = { +static const struct ins_ops arithmetic_ops = { .parse = arithmetic__parse, .scnprintf = arithmetic__scnprintf, }; @@ -772,7 +772,7 @@ static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_ return 0; } -static struct ins_ops load_store_ops = { +static const struct ins_ops load_store_ops = { .parse = load_store__parse, .scnprintf = load_store__scnprintf, }; @@ -813,7 +813,7 @@ static int dec__scnprintf(struct ins *ins, char *bf, size_t size, ops->target.name ?: ops->target.raw); } -static struct ins_ops dec_ops = { +static const struct ins_ops dec_ops = { .parse = dec__parse, .scnprintf = dec__scnprintf, }; @@ -824,11 +824,11 @@ static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); } -static struct ins_ops nop_ops = { +static const struct ins_ops nop_ops = { .scnprintf = nop__scnprintf, }; -static struct ins_ops ret_ops = { +static const struct ins_ops ret_ops = { .scnprintf = ins__raw_scnprintf, }; @@ -869,7 +869,7 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(const struct arch *arch, const char *name, +static const struct ins_ops *__ins__find(const struct arch *arch, const char *name, struct disasm_line *dl) { struct ins *ins; @@ -880,7 +880,7 @@ static struct ins_ops *__ins__find(const struct arch *arch, const char *name, * For powerpc, identify the instruction ops * from the opcode using raw_insn. */ - struct ins_ops *ops; + const struct ins_ops *ops; ops = check_ppc_insn(dl); if (ops) @@ -916,9 +916,9 @@ static struct ins_ops *__ins__find(const struct arch *arch, const char *name, return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl) +const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl) { - struct ins_ops *ops = __ins__find(arch, name, dl); + const struct ins_ops *ops = __ins__find(arch, name, dl); if (!ops && arch->associate_instruction_ops) ops = arch->associate_instruction_ops((struct arch *)arch, name); diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 273a9c906514..dc5233f2a773 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -22,7 +22,7 @@ struct arch { struct ins *instructions; size_t nr_instructions; size_t nr_instructions_allocated; - struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); + const struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); bool sorted_instructions; bool initialized; const char *insn_suffix; @@ -52,7 +52,7 @@ struct arch { struct ins { const char *name; - struct ins_ops *ops; + const struct ins_ops *ops; }; struct ins_operands { @@ -108,7 +108,7 @@ struct annotate_args { const struct arch *arch__find(const char *name); bool arch__is(const struct arch *arch, const char *name); -struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); +const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); bool ins__is_call(const struct ins *ins); bool ins__is_jump(const struct ins *ins); -- cgit v1.2.3 From 2a1ca20d0b586d582e56fcb615b27045834d415a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:10 -0800 Subject: perf disasm: Constify use of 'struct ins' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'struct ins' holds variables that are read but not written, except during some initialization. Change most uses to be for a "const struct ins *" version to capture this immutability. So the x86__instructions can be const pre-sort it and make the sorted variable true. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/annotate/instructions.c | 2 +- tools/perf/arch/x86/annotate/instructions.c | 26 +++++++++++++++++----- tools/perf/util/disasm.c | 32 ++++++++++++++------------- tools/perf/util/disasm.h | 4 ++-- 4 files changed, 40 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 363af2f55122..44db33854dba 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -60,7 +60,7 @@ out_free_source: return -1; } -static int mov__scnprintf(struct ins *ins, char *bf, size_t size, +static int mov__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); static const struct ins_ops arm64_mov_ops = { diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 24b388bacdae..ffca3029388b 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -7,7 +7,7 @@ * So this table should not have entries with the suffix unless it's * a complete different instruction than ones without the suffix. */ -static struct ins x86__instructions[] = { +static const struct ins x86__instructions[] = { { .name = "adc", .ops = &mov_ops, }, { .name = "add", .ops = &mov_ops, }, { .name = "addsd", .ops = &mov_ops, }, @@ -19,9 +19,9 @@ static struct ins x86__instructions[] = { { .name = "btr", .ops = &mov_ops, }, { .name = "bts", .ops = &mov_ops, }, { .name = "call", .ops = &call_ops, }, + { .name = "cmovae", .ops = &mov_ops, }, { .name = "cmovbe", .ops = &mov_ops, }, { .name = "cmove", .ops = &mov_ops, }, - { .name = "cmovae", .ops = &mov_ops, }, { .name = "cmp", .ops = &mov_ops, }, { .name = "cmpxch", .ops = &mov_ops, }, { .name = "cmpxchg", .ops = &mov_ops, }, @@ -73,23 +73,23 @@ static struct ins x86__instructions[] = { { .name = "movaps", .ops = &mov_ops, }, { .name = "movdqa", .ops = &mov_ops, }, { .name = "movdqu", .ops = &mov_ops, }, + { .name = "movsb", .ops = &mov_ops, }, { .name = "movsd", .ops = &mov_ops, }, + { .name = "movsl", .ops = &mov_ops, }, { .name = "movss", .ops = &mov_ops, }, - { .name = "movsb", .ops = &mov_ops, }, { .name = "movsw", .ops = &mov_ops, }, - { .name = "movsl", .ops = &mov_ops, }, { .name = "movupd", .ops = &mov_ops, }, { .name = "movups", .ops = &mov_ops, }, { .name = "movzb", .ops = &mov_ops, }, - { .name = "movzw", .ops = &mov_ops, }, { .name = "movzl", .ops = &mov_ops, }, + { .name = "movzw", .ops = &mov_ops, }, { .name = "mulsd", .ops = &mov_ops, }, { .name = "mulss", .ops = &mov_ops, }, { .name = "nop", .ops = &nop_ops, }, { .name = "or", .ops = &mov_ops, }, { .name = "orps", .ops = &mov_ops, }, - { .name = "pand", .ops = &mov_ops, }, { .name = "paddq", .ops = &mov_ops, }, + { .name = "pand", .ops = &mov_ops, }, { .name = "pcmpeqb", .ops = &mov_ops, }, { .name = "por", .ops = &mov_ops, }, { .name = "rcl", .ops = &mov_ops, }, @@ -202,6 +202,20 @@ static int x86__annotate_init(struct arch *arch, char *cpuid) if (x86__cpuid_parse(arch, cpuid)) err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; } + +#ifndef NDEBUG + { + static bool sorted_check; + + if (!sorted_check) { + for (size_t i = 0; i < arch->nr_instructions - 1; i++) { + assert(strcmp(arch->instructions[i].name, + arch->instructions[i + 1].name) <= 0); + } + sorted_check = true; + } + } +#endif arch->e_machine = EM_X86_64; arch->e_flags = 0; arch->initialized = true; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 9bc9b1de98db..2793697ce75c 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -43,9 +43,9 @@ static const struct ins_ops ret_ops; static const struct ins_ops load_store_ops; static const struct ins_ops arithmetic_ops; -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, +static int jump__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -static int call__scnprintf(struct ins *ins, char *bf, size_t size, +static int call__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); static void ins__sort(struct arch *arch); @@ -66,7 +66,8 @@ static int arch__grow_instructions(struct arch *arch) goto grow_from_non_allocated_table; new_nr_allocated = arch->nr_instructions_allocated + 128; - new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins)); + new_instructions = realloc((void *)arch->instructions, + new_nr_allocated * sizeof(struct ins)); if (new_instructions == NULL) return -1; @@ -93,7 +94,7 @@ static int arch__associate_ins_ops(struct arch *arch, const char *name, const st arch__grow_instructions(arch)) return -1; - ins = &arch->instructions[arch->nr_instructions]; + ins = (struct ins *)&arch->instructions[arch->nr_instructions]; ins->name = strdup(name); if (!ins->name) return -1; @@ -146,6 +147,7 @@ static struct arch architectures[] = { .init = x86__annotate_init, .instructions = x86__instructions, .nr_instructions = ARRAY_SIZE(x86__instructions), + .sorted_instructions = true, .insn_suffix = "bwlq", .objdump = { .comment_char = '#', @@ -241,13 +243,13 @@ static void ins_ops__delete(struct ins_operands *ops) zfree(&ops->target.name); } -static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size, +static int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); } -static int ins__scnprintf(struct ins *ins, char *bf, size_t size, +static int ins__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { if (ins->ops->scnprintf) @@ -319,7 +321,7 @@ indirect_call: goto find_target; } -static int call__scnprintf(struct ins *ins, char *bf, size_t size, +static int call__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { if (ops->target.sym) @@ -446,7 +448,7 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct return 0; } -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, +static int jump__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { const char *c; @@ -551,7 +553,7 @@ out_free_ops: return 0; } -static int lock__scnprintf(struct ins *ins, char *bf, size_t size, +static int lock__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { int printed; @@ -680,7 +682,7 @@ out_free_source: return -1; } -static int mov__scnprintf(struct ins *ins, char *bf, size_t size, +static int mov__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, @@ -699,7 +701,7 @@ static const struct ins_ops mov_ops = { #define ADD_ZERO_EXT_XO_FORM 202 #define SUB_ZERO_EXT_XO_FORM 200 -static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size, +static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, @@ -743,7 +745,7 @@ static const struct ins_ops arithmetic_ops = { .scnprintf = arithmetic__scnprintf, }; -static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, +static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, @@ -806,7 +808,7 @@ static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operand return 0; } -static int dec__scnprintf(struct ins *ins, char *bf, size_t size, +static int dec__scnprintf(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, @@ -818,7 +820,7 @@ static const struct ins_ops dec_ops = { .scnprintf = dec__scnprintf, }; -static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, +static int nop__scnprintf(const struct ins *ins __maybe_unused, char *bf, size_t size, struct ins_operands *ops __maybe_unused, int max_ins_name) { return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); @@ -866,7 +868,7 @@ static void ins__sort(struct arch *arch) { const int nmemb = arch->nr_instructions; - qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); + qsort((void *)arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } static const struct ins_ops *__ins__find(const struct arch *arch, const char *name, diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index dc5233f2a773..4f5c9a985786 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -19,7 +19,7 @@ struct disasm_line; struct arch { const char *name; - struct ins *instructions; + const struct ins *instructions; size_t nr_instructions; size_t nr_instructions_allocated; const struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); @@ -91,7 +91,7 @@ struct ins_ops { void (*free)(struct ins_operands *ops); int (*parse)(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, struct disasm_line *dl); - int (*scnprintf)(struct ins *ins, char *bf, size_t size, + int (*scnprintf)(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); }; -- cgit v1.2.3 From 9273085273103e5994952dc2725f1f0109af97d1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:11 -0800 Subject: perf disasm: Rework the string arch__is to use the ELF machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new arch__is_x86 and arch__is_powerpc functions that avoid string comparisons and use the ELF machine. Remove arch__is() that is no longer used. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 10 +++++----- tools/perf/util/annotate.c | 16 ++++++++-------- tools/perf/util/capstone.c | 2 +- tools/perf/util/disasm.c | 15 ++++++++++----- tools/perf/util/disasm.h | 3 ++- tools/perf/util/llvm.c | 2 +- 6 files changed, 27 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index edfcd6e9df9c..44fbd41e3845 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -165,7 +165,7 @@ static void init_type_state(struct type_state *state, const struct arch *arch) memset(state, 0, sizeof(*state)); INIT_LIST_HEAD(&state->stack_vars); - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { state->regs[0].caller_saved = true; state->regs[1].caller_saved = true; state->regs[2].caller_saved = true; @@ -526,7 +526,7 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, needs_pointer = false; else if (reg == dloc->fbreg || is_fbreg) needs_pointer = false; - else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP) + else if (arch__is_x86(dloc->arch) && reg == X86_REG_SP) needs_pointer = false; /* Get the type of the variable */ @@ -1071,7 +1071,7 @@ static void delete_var_types(struct die_var_type *var_types) /* should match to is_stack_canary() in util/annotate.c */ static void setup_stack_canary(struct data_loc_info *dloc) { - if (arch__is(dloc->arch, "x86")) { + if (arch__is_x86(dloc->arch)) { dloc->op->segment = INSN_SEG_X86_GS; dloc->op->imm = true; dloc->op->offset = 40; @@ -1311,7 +1311,7 @@ check_kernel: /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && - arch__is(dloc->arch, "x86")) { + arch__is_x86(dloc->arch)) { pr_debug_dtp("this-cpu var"); addr = dloc->op->offset; @@ -1397,7 +1397,7 @@ out: static int arch_supports_insn_tracking(struct data_loc_info *dloc) { - if ((arch__is(dloc->arch, "x86")) || (arch__is(dloc->arch, "powerpc"))) + if ((arch__is_x86(dloc->arch)) || (arch__is_powerpc(dloc->arch))) return 1; return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 132af2556aec..79702072568b 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2474,7 +2474,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str, * %gs:0x18(%rbx). In that case it should skip the part. */ if (*str == arch->objdump.register_char) { - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { /* FIXME: Handle other segment registers */ if (!strncmp(str, "%gs:", 4)) op_loc->segment = INSN_SEG_X86_GS; @@ -2571,7 +2571,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, op_loc->reg2 = -1; if (insn_str == NULL) { - if (!arch__is(arch, "powerpc")) + if (!arch__is_powerpc(arch)) continue; } @@ -2580,7 +2580,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, * required fields for op_loc, ie reg1, reg2, offset from the * raw instruction. */ - if (arch__is(arch, "powerpc")) { + if (arch__is_powerpc(arch)) { op_loc->mem_ref = mem_ref; op_loc->multi_regs = multi_regs; get_powerpc_regs(dl->raw.raw_insn, !i, op_loc); @@ -2591,7 +2591,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, } else { char *s, *p = NULL; - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { /* FIXME: Handle other segment registers */ if (!strncmp(insn_str, "%gs:", 4)) { op_loc->segment = INSN_SEG_X86_GS; @@ -2675,7 +2675,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head, static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl) { - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { if (!strncmp(dl->ins.name, "push", 4) || !strncmp(dl->ins.name, "pop", 3) || !strncmp(dl->ins.name, "call", 4) || @@ -2689,7 +2689,7 @@ static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl) static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *loc) { /* On x86_64, %gs:40 is used for stack canary */ - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { if (loc->segment == INSN_SEG_X86_GS && loc->imm && loc->offset == 40) return true; @@ -2704,7 +2704,7 @@ static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *lo */ static bool is_address_gen_insn(const struct arch *arch, struct disasm_line *dl) { - if (arch__is(arch, "x86")) { + if (arch__is_x86(arch)) { if (!strncmp(dl->ins.name, "lea", 3)) return true; } @@ -2847,7 +2847,7 @@ __hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch, } /* This CPU access in kernel - pretend PC-relative addressing */ - if (dso__kernel(map__dso(ms->map)) && arch__is(arch, "x86") && + if (dso__kernel(map__dso(ms->map)) && arch__is_x86(arch) && op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) { dloc.var_addr = op_loc->offset; op_loc->reg1 = DWARF_REG_PC; diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c index 2c7feab61b7b..ce06cfd253ef 100644 --- a/tools/perf/util/capstone.c +++ b/tools/perf/util/capstone.c @@ -147,7 +147,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct symbol *sym; /* TODO: support more architectures */ - if (!arch__is(args->arch, "x86")) + if (!arch__is_x86(args->arch)) return; if (insn->detail == NULL) diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 2793697ce75c..b7523256c4ad 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -228,9 +228,14 @@ const struct arch *arch__find(const char *name) return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); } -bool arch__is(const struct arch *arch, const char *name) +bool arch__is_x86(const struct arch *arch) { - return !strcmp(arch->name, name); + return arch->e_machine == EM_386 || arch->e_machine == EM_X86_64; +} + +bool arch__is_powerpc(const struct arch *arch) +{ + return arch->e_machine == EM_PPC || arch->e_machine == EM_PPC64; } static void ins_ops__delete(struct ins_operands *ops) @@ -877,7 +882,7 @@ static const struct ins_ops *__ins__find(const struct arch *arch, const char *na struct ins *ins; const int nmemb = arch->nr_instructions; - if (arch__is(arch, "powerpc")) { + if (arch__is_powerpc(arch)) { /* * For powerpc, identify the instruction ops * from the opcode using raw_insn. @@ -1066,7 +1071,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args) goto out_delete; if (args->offset != -1) { - if (arch__is(args->arch, "powerpc")) { + if (arch__is_powerpc(args->arch)) { if (disasm_line__parse_powerpc(dl, args) < 0) goto out_free_line; } else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) @@ -1700,7 +1705,7 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) * and typeoff, disassemble to mnemonic notation is not required in * case of powerpc. */ - if (arch__is(args->arch, "powerpc")) { + if (arch__is_powerpc(args->arch)) { extern const char *sort_order; if (sort_order && !strstr(sort_order, "sym")) { diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 4f5c9a985786..db7f1ee3d8e7 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -106,7 +106,8 @@ struct annotate_args { }; const struct arch *arch__find(const char *name); -bool arch__is(const struct arch *arch, const char *name); +bool arch__is_x86(const struct arch *arch); +bool arch__is_powerpc(const struct arch *arch); const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c index 4ada9a10bd93..0d126d233c01 100644 --- a/tools/perf/util/llvm.c +++ b/tools/perf/util/llvm.c @@ -146,7 +146,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym, return errno; init_llvm(); - if (arch__is(args->arch, "x86")) { + if (arch__is_x86(args->arch)) { const char *triplet = is_64bit ? "x86_64-pc-linux" : "i686-pc-linux"; disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0, -- cgit v1.2.3 From 07b972ff09f45cfb7acd20cd9b3769c6975bc434 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:12 -0800 Subject: perf disasm: Don't include C files from the arch directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the arch instructions.c files into appropriately named files in annotate-arch in the util directory. Don't #include to compile the code, switch to building the files and fix up the #includes accordingly. Move powerpc specific disasm code out of disasm.c and into annotate-powerpc.c. Declarations and static removed as appropriate for the code to compile as separate compilation units. The e_machine and e_flags set up is moved to the disasm.c architectures array so that later patches can sort by them. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arc/annotate/instructions.c | 11 - tools/perf/arch/arm/annotate/instructions.c | 66 -- tools/perf/arch/arm64/annotate/instructions.c | 126 ---- tools/perf/arch/csky/annotate/instructions.c | 53 -- tools/perf/arch/loongarch/annotate/instructions.c | 145 ---- tools/perf/arch/mips/annotate/instructions.c | 48 -- tools/perf/arch/powerpc/annotate/instructions.c | 317 -------- tools/perf/arch/riscv64/annotate/instructions.c | 36 - tools/perf/arch/s390/annotate/instructions.c | 178 ----- tools/perf/arch/sparc/annotate/instructions.c | 171 ----- tools/perf/arch/x86/annotate/instructions.c | 797 -------------------- tools/perf/util/Build | 1 + tools/perf/util/annotate-arch/Build | 11 + tools/perf/util/annotate-arch/annotate-arc.c | 10 + tools/perf/util/annotate-arch/annotate-arm.c | 65 ++ tools/perf/util/annotate-arch/annotate-arm64.c | 124 ++++ tools/perf/util/annotate-arch/annotate-csky.c | 48 ++ tools/perf/util/annotate-arch/annotate-loongarch.c | 148 ++++ tools/perf/util/annotate-arch/annotate-mips.c | 48 ++ tools/perf/util/annotate-arch/annotate-powerpc.c | 406 ++++++++++ tools/perf/util/annotate-arch/annotate-riscv64.c | 36 + tools/perf/util/annotate-arch/annotate-s390.c | 185 +++++ tools/perf/util/annotate-arch/annotate-sparc.c | 172 +++++ tools/perf/util/annotate-arch/annotate-x86.c | 820 +++++++++++++++++++++ tools/perf/util/disasm.c | 199 ++--- tools/perf/util/disasm.h | 38 + 26 files changed, 2155 insertions(+), 2104 deletions(-) delete mode 100644 tools/perf/arch/arc/annotate/instructions.c delete mode 100644 tools/perf/arch/arm/annotate/instructions.c delete mode 100644 tools/perf/arch/arm64/annotate/instructions.c delete mode 100644 tools/perf/arch/csky/annotate/instructions.c delete mode 100644 tools/perf/arch/loongarch/annotate/instructions.c delete mode 100644 tools/perf/arch/mips/annotate/instructions.c delete mode 100644 tools/perf/arch/powerpc/annotate/instructions.c delete mode 100644 tools/perf/arch/riscv64/annotate/instructions.c delete mode 100644 tools/perf/arch/s390/annotate/instructions.c delete mode 100644 tools/perf/arch/sparc/annotate/instructions.c delete mode 100644 tools/perf/arch/x86/annotate/instructions.c create mode 100644 tools/perf/util/annotate-arch/Build create mode 100644 tools/perf/util/annotate-arch/annotate-arc.c create mode 100644 tools/perf/util/annotate-arch/annotate-arm.c create mode 100644 tools/perf/util/annotate-arch/annotate-arm64.c create mode 100644 tools/perf/util/annotate-arch/annotate-csky.c create mode 100644 tools/perf/util/annotate-arch/annotate-loongarch.c create mode 100644 tools/perf/util/annotate-arch/annotate-mips.c create mode 100644 tools/perf/util/annotate-arch/annotate-powerpc.c create mode 100644 tools/perf/util/annotate-arch/annotate-riscv64.c create mode 100644 tools/perf/util/annotate-arch/annotate-s390.c create mode 100644 tools/perf/util/annotate-arch/annotate-sparc.c create mode 100644 tools/perf/util/annotate-arch/annotate-x86.c (limited to 'tools') diff --git a/tools/perf/arch/arc/annotate/instructions.c b/tools/perf/arch/arc/annotate/instructions.c deleted file mode 100644 index e5619770a1af..000000000000 --- a/tools/perf/arch/arc/annotate/instructions.c +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include - -static int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - arch->initialized = true; - arch->objdump.comment_char = ';'; - arch->e_machine = EM_ARC; - arch->e_flags = 0; - return 0; -} diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c deleted file mode 100644 index b997d127fedd..000000000000 --- a/tools/perf/arch/arm/annotate/instructions.c +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -struct arm_annotate { - regex_t call_insn, - jump_insn; -}; - -static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name) -{ - struct arm_annotate *arm = arch->priv; - const struct ins_ops *ops; - regmatch_t match[2]; - - if (!regexec(&arm->call_insn, name, 2, match, 0)) - ops = &call_ops; - else if (!regexec(&arm->jump_insn, name, 2, match, 0)) - ops = &jump_ops; - else - return NULL; - - arch__associate_ins_ops(arch, name, ops); - return ops; -} - -static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - struct arm_annotate *arm; - int err; - - if (arch->initialized) - return 0; - - arm = zalloc(sizeof(*arm)); - if (!arm) - return ENOMEM; - -#define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)" - err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED); - if (err) - goto out_free_arm; - err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED); - if (err) - goto out_free_call; -#undef ARM_CONDS - - arch->initialized = true; - arch->priv = arm; - arch->associate_instruction_ops = arm__associate_instruction_ops; - arch->objdump.comment_char = ';'; - arch->objdump.skip_functions_char = '+'; - arch->e_machine = EM_ARM; - arch->e_flags = 0; - return 0; - -out_free_call: - regfree(&arm->call_insn); -out_free_arm: - free(arm); - return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; -} diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c deleted file mode 100644 index 44db33854dba..000000000000 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -struct arm64_annotate { - regex_t call_insn, - jump_insn; -}; - -static int arm64_mov__parse(const struct arch *arch __maybe_unused, - struct ins_operands *ops, - struct map_symbol *ms __maybe_unused, - struct disasm_line *dl __maybe_unused) -{ - char *s = strchr(ops->raw, ','), *target, *endptr; - - if (s == NULL) - return -1; - - *s = '\0'; - ops->source.raw = strdup(ops->raw); - *s = ','; - - if (ops->source.raw == NULL) - return -1; - - target = ++s; - ops->target.raw = strdup(target); - if (ops->target.raw == NULL) - goto out_free_source; - - ops->target.addr = strtoull(target, &endptr, 16); - if (endptr == target) - goto out_free_target; - - s = strchr(endptr, '<'); - if (s == NULL) - goto out_free_target; - endptr = strchr(s + 1, '>'); - if (endptr == NULL) - goto out_free_target; - - *endptr = '\0'; - *s = ' '; - ops->target.name = strdup(s); - *s = '<'; - *endptr = '>'; - if (ops->target.name == NULL) - goto out_free_target; - - return 0; - -out_free_target: - zfree(&ops->target.raw); -out_free_source: - zfree(&ops->source.raw); - return -1; -} - -static int mov__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); - -static const struct ins_ops arm64_mov_ops = { - .parse = arm64_mov__parse, - .scnprintf = mov__scnprintf, -}; - -static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) -{ - struct arm64_annotate *arm = arch->priv; - const struct ins_ops *ops; - regmatch_t match[2]; - - if (!regexec(&arm->jump_insn, name, 2, match, 0)) - ops = &jump_ops; - else if (!regexec(&arm->call_insn, name, 2, match, 0)) - ops = &call_ops; - else if (!strcmp(name, "ret")) - ops = &ret_ops; - else - ops = &arm64_mov_ops; - - arch__associate_ins_ops(arch, name, ops); - return ops; -} - -static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - struct arm64_annotate *arm; - int err; - - if (arch->initialized) - return 0; - - arm = zalloc(sizeof(*arm)); - if (!arm) - return ENOMEM; - - /* bl, blr */ - err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED); - if (err) - goto out_free_arm; - /* b, b.cond, br, cbz/cbnz, tbz/tbnz */ - err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$", - REG_EXTENDED); - if (err) - goto out_free_call; - - arch->initialized = true; - arch->priv = arm; - arch->associate_instruction_ops = arm64__associate_instruction_ops; - arch->objdump.comment_char = '/'; - arch->objdump.skip_functions_char = '+'; - arch->e_machine = EM_AARCH64; - arch->e_flags = 0; - return 0; - -out_free_call: - regfree(&arm->call_insn); -out_free_arm: - free(arm); - return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; -} diff --git a/tools/perf/arch/csky/annotate/instructions.c b/tools/perf/arch/csky/annotate/instructions.c deleted file mode 100644 index 4a55c84a320a..000000000000 --- a/tools/perf/arch/csky/annotate/instructions.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. - -#include - -static const struct ins_ops *csky__associate_ins_ops(struct arch *arch, - const char *name) -{ - const struct ins_ops *ops = NULL; - - /* catch all kind of jumps */ - if (!strcmp(name, "bt") || - !strcmp(name, "bf") || - !strcmp(name, "bez") || - !strcmp(name, "bnez") || - !strcmp(name, "bnezad") || - !strcmp(name, "bhsz") || - !strcmp(name, "bhz") || - !strcmp(name, "blsz") || - !strcmp(name, "blz") || - !strcmp(name, "br") || - !strcmp(name, "jmpi") || - !strcmp(name, "jmp")) - ops = &jump_ops; - - /* catch function call */ - if (!strcmp(name, "bsr") || - !strcmp(name, "jsri") || - !strcmp(name, "jsr")) - ops = &call_ops; - - /* catch function return */ - if (!strcmp(name, "rts")) - ops = &ret_ops; - - if (ops) - arch__associate_ins_ops(arch, name, ops); - return ops; -} - -static int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - arch->initialized = true; - arch->objdump.comment_char = '/'; - arch->associate_instruction_ops = csky__associate_ins_ops; - arch->e_machine = EM_CSKY; -#if defined(__CSKYABIV2__) - arch->e_flags = EF_CSKY_ABIV2; -#else - arch->e_flags = EF_CSKY_ABIV1; -#endif - return 0; -} diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c deleted file mode 100644 index 5010d5d58375..000000000000 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ /dev/null @@ -1,145 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Perf annotate functions. - * - * Copyright (C) 2020-2023 Loongson Technology Corporation Limited - */ - -static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms, - struct disasm_line *dl __maybe_unused) - -{ - char *c, *endptr, *tok, *name; - struct map *map = ms->map; - struct addr_map_symbol target; - - c = strchr(ops->raw, '#'); - if (c++ == NULL) - return -1; - - ops->target.addr = strtoull(c, &endptr, 16); - - name = strchr(endptr, '<'); - name++; - - if (arch->objdump.skip_functions_char && - strchr(name, arch->objdump.skip_functions_char)) - return -1; - - tok = strchr(name, '>'); - if (tok == NULL) - return -1; - - *tok = '\0'; - ops->target.name = strdup(name); - *tok = '>'; - - if (ops->target.name == NULL) - return -1; - - target = (struct addr_map_symbol) { - .ms = { .map = map__get(map), }, - .addr = map__objdump_2mem(map, ops->target.addr), - }; - - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - addr_map_symbol__exit(&target); - return 0; -} - -static const struct ins_ops loongarch_call_ops = { - .parse = loongarch_call__parse, - .scnprintf = call__scnprintf, -}; - -static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms, - struct disasm_line *dl __maybe_unused) - -{ - struct map *map = ms->map; - struct symbol *sym = ms->sym; - struct addr_map_symbol target = { - .ms = { .map = map__get(map), }, - }; - const char *c = strchr(ops->raw, '#'); - u64 start, end; - - ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); - ops->jump.raw_func_start = strchr(ops->raw, '<'); - - if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) - c = NULL; - - if (c++ != NULL) - ops->target.addr = strtoull(c, NULL, 16); - else - ops->target.addr = strtoull(ops->raw, NULL, 16); - - target.addr = map__objdump_2mem(map, ops->target.addr); - start = map__unmap_ip(map, sym->start); - end = map__unmap_ip(map, sym->end); - - ops->target.outside = target.addr < start || target.addr > end; - - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - if (!ops->target.outside) { - ops->target.offset = target.addr - start; - ops->target.offset_avail = true; - } else { - ops->target.offset_avail = false; - } - addr_map_symbol__exit(&target); - return 0; -} - -static const struct ins_ops loongarch_jump_ops = { - .parse = loongarch_jump__parse, - .scnprintf = jump__scnprintf, -}; - -static -const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) -{ - const struct ins_ops *ops = NULL; - - if (!strcmp(name, "bl")) - ops = &loongarch_call_ops; - else if (!strcmp(name, "jirl")) - ops = &ret_ops; - else if (!strcmp(name, "b") || - !strncmp(name, "beq", 3) || - !strncmp(name, "bne", 3) || - !strncmp(name, "blt", 3) || - !strncmp(name, "bge", 3) || - !strncmp(name, "bltu", 4) || - !strncmp(name, "bgeu", 4)) - ops = &loongarch_jump_ops; - else - return NULL; - - arch__associate_ins_ops(arch, name, ops); - - return ops; -} - -static -int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - if (!arch->initialized) { - arch->associate_instruction_ops = loongarch__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - arch->e_machine = EM_LOONGARCH; - arch->e_flags = 0; - } - - return 0; -} diff --git a/tools/perf/arch/mips/annotate/instructions.c b/tools/perf/arch/mips/annotate/instructions.c deleted file mode 100644 index 0fbe0a7df95a..000000000000 --- a/tools/perf/arch/mips/annotate/instructions.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -static -const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name) -{ - const struct ins_ops *ops = NULL; - - if (!strncmp(name, "bal", 3) || - !strncmp(name, "bgezal", 6) || - !strncmp(name, "bltzal", 6) || - !strncmp(name, "bgtzal", 6) || - !strncmp(name, "blezal", 6) || - !strncmp(name, "beqzal", 6) || - !strncmp(name, "bnezal", 6) || - !strncmp(name, "bgtzl", 5) || - !strncmp(name, "bltzl", 5) || - !strncmp(name, "bgezl", 5) || - !strncmp(name, "blezl", 5) || - !strncmp(name, "jialc", 5) || - !strncmp(name, "beql", 4) || - !strncmp(name, "bnel", 4) || - !strncmp(name, "jal", 3)) - ops = &call_ops; - else if (!strncmp(name, "jr", 2)) - ops = &ret_ops; - else if (name[0] == 'j' || name[0] == 'b') - ops = &jump_ops; - else - return NULL; - - arch__associate_ins_ops(arch, name, ops); - - return ops; -} - -static -int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - if (!arch->initialized) { - arch->associate_instruction_ops = mips__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - arch->e_machine = EM_MIPS; - arch->e_flags = 0; - } - - return 0; -} diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c deleted file mode 100644 index d1be55425e35..000000000000 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ /dev/null @@ -1,317 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include - -static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name) -{ - int i; - const struct ins_ops *ops; - - /* - * - Interested only if instruction starts with 'b'. - * - Few start with 'b', but aren't branch instructions. - */ - if (name[0] != 'b' || - !strncmp(name, "bcd", 3) || - !strncmp(name, "brinc", 5) || - !strncmp(name, "bper", 4)) - return NULL; - - ops = &jump_ops; - - i = strlen(name) - 1; - if (i < 0) - return NULL; - - /* ignore optional hints at the end of the instructions */ - if (name[i] == '+' || name[i] == '-') - i--; - - if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) { - /* - * if the instruction ends up with 'l' or 'la', then - * those are considered 'calls' since they update LR. - * ... except for 'bnl' which is branch if not less than - * and the absolute form of the same. - */ - if (strcmp(name, "bnl") && strcmp(name, "bnl+") && - strcmp(name, "bnl-") && strcmp(name, "bnla") && - strcmp(name, "bnla+") && strcmp(name, "bnla-")) - ops = &call_ops; - } - if (name[i] == 'r' && name[i-1] == 'l') - /* - * instructions ending with 'lr' are considered to be - * return instructions - */ - ops = &ret_ops; - - arch__associate_ins_ops(arch, name, ops); - return ops; -} - -#define PPC_OP(op) (((op) >> 26) & 0x3F) -#define PPC_21_30(R) (((R) >> 1) & 0x3ff) -#define PPC_22_30(R) (((R) >> 1) & 0x1ff) - -struct insn_offset { - const char *name; - int value; -}; - -/* - * There are memory instructions with opcode 31 which are - * of X Form, Example: - * ldx RT,RA,RB - * ______________________________________ - * | 31 | RT | RA | RB | 21 |/| - * -------------------------------------- - * 0 6 11 16 21 30 31 - * - * But all instructions with opcode 31 are not memory. - * Example: add RT,RA,RB - * - * Use bits 21 to 30 to check memory insns with 31 as opcode. - * In ins_array below, for ldx instruction: - * name => OP_31_XOP_LDX - * value => 21 - */ - -static struct insn_offset ins_array[] = { - { .name = "OP_31_XOP_LXSIWZX", .value = 12, }, - { .name = "OP_31_XOP_LWARX", .value = 20, }, - { .name = "OP_31_XOP_LDX", .value = 21, }, - { .name = "OP_31_XOP_LWZX", .value = 23, }, - { .name = "OP_31_XOP_LDUX", .value = 53, }, - { .name = "OP_31_XOP_LWZUX", .value = 55, }, - { .name = "OP_31_XOP_LXSIWAX", .value = 76, }, - { .name = "OP_31_XOP_LDARX", .value = 84, }, - { .name = "OP_31_XOP_LBZX", .value = 87, }, - { .name = "OP_31_XOP_LVX", .value = 103, }, - { .name = "OP_31_XOP_LBZUX", .value = 119, }, - { .name = "OP_31_XOP_STXSIWX", .value = 140, }, - { .name = "OP_31_XOP_STDX", .value = 149, }, - { .name = "OP_31_XOP_STWX", .value = 151, }, - { .name = "OP_31_XOP_STDUX", .value = 181, }, - { .name = "OP_31_XOP_STWUX", .value = 183, }, - { .name = "OP_31_XOP_STBX", .value = 215, }, - { .name = "OP_31_XOP_STVX", .value = 231, }, - { .name = "OP_31_XOP_STBUX", .value = 247, }, - { .name = "OP_31_XOP_LHZX", .value = 279, }, - { .name = "OP_31_XOP_LHZUX", .value = 311, }, - { .name = "OP_31_XOP_LXVDSX", .value = 332, }, - { .name = "OP_31_XOP_LWAX", .value = 341, }, - { .name = "OP_31_XOP_LHAX", .value = 343, }, - { .name = "OP_31_XOP_LWAUX", .value = 373, }, - { .name = "OP_31_XOP_LHAUX", .value = 375, }, - { .name = "OP_31_XOP_STHX", .value = 407, }, - { .name = "OP_31_XOP_STHUX", .value = 439, }, - { .name = "OP_31_XOP_LXSSPX", .value = 524, }, - { .name = "OP_31_XOP_LDBRX", .value = 532, }, - { .name = "OP_31_XOP_LSWX", .value = 533, }, - { .name = "OP_31_XOP_LWBRX", .value = 534, }, - { .name = "OP_31_XOP_LFSUX", .value = 567, }, - { .name = "OP_31_XOP_LXSDX", .value = 588, }, - { .name = "OP_31_XOP_LSWI", .value = 597, }, - { .name = "OP_31_XOP_LFDX", .value = 599, }, - { .name = "OP_31_XOP_LFDUX", .value = 631, }, - { .name = "OP_31_XOP_STXSSPX", .value = 652, }, - { .name = "OP_31_XOP_STDBRX", .value = 660, }, - { .name = "OP_31_XOP_STXWX", .value = 661, }, - { .name = "OP_31_XOP_STWBRX", .value = 662, }, - { .name = "OP_31_XOP_STFSX", .value = 663, }, - { .name = "OP_31_XOP_STFSUX", .value = 695, }, - { .name = "OP_31_XOP_STXSDX", .value = 716, }, - { .name = "OP_31_XOP_STSWI", .value = 725, }, - { .name = "OP_31_XOP_STFDX", .value = 727, }, - { .name = "OP_31_XOP_STFDUX", .value = 759, }, - { .name = "OP_31_XOP_LXVW4X", .value = 780, }, - { .name = "OP_31_XOP_LHBRX", .value = 790, }, - { .name = "OP_31_XOP_LXVD2X", .value = 844, }, - { .name = "OP_31_XOP_LFIWAX", .value = 855, }, - { .name = "OP_31_XOP_LFIWZX", .value = 887, }, - { .name = "OP_31_XOP_STXVW4X", .value = 908, }, - { .name = "OP_31_XOP_STHBRX", .value = 918, }, - { .name = "OP_31_XOP_STXVD2X", .value = 972, }, - { .name = "OP_31_XOP_STFIWX", .value = 983, }, -}; - -/* - * Arithmetic instructions which are having opcode as 31. - * These instructions are tracked to save the register state - * changes. Example: - * - * lwz r10,264(r3) - * add r31, r3, r3 - * lwz r9, 0(r31) - * - * Here instruction tracking needs to identify the "add" - * instruction and save data type of r3 to r31. If a sample - * is hit at next "lwz r9, 0(r31)", by this instruction tracking, - * data type of r31 can be resolved. - */ -static struct insn_offset arithmetic_ins_op_31[] = { - { .name = "SUB_CARRY_XO_FORM", .value = 8, }, - { .name = "MUL_HDW_XO_FORM1", .value = 9, }, - { .name = "ADD_CARRY_XO_FORM", .value = 10, }, - { .name = "MUL_HW_XO_FORM1", .value = 11, }, - { .name = "SUB_XO_FORM", .value = 40, }, - { .name = "MUL_HDW_XO_FORM", .value = 73, }, - { .name = "MUL_HW_XO_FORM", .value = 75, }, - { .name = "SUB_EXT_XO_FORM", .value = 136, }, - { .name = "ADD_EXT_XO_FORM", .value = 138, }, - { .name = "SUB_ZERO_EXT_XO_FORM", .value = 200, }, - { .name = "ADD_ZERO_EXT_XO_FORM", .value = 202, }, - { .name = "SUB_EXT_XO_FORM2", .value = 232, }, - { .name = "MUL_DW_XO_FORM", .value = 233, }, - { .name = "ADD_EXT_XO_FORM2", .value = 234, }, - { .name = "MUL_W_XO_FORM", .value = 235, }, - { .name = "ADD_XO_FORM", .value = 266, }, - { .name = "DIV_DW_XO_FORM1", .value = 457, }, - { .name = "DIV_W_XO_FORM1", .value = 459, }, - { .name = "DIV_DW_XO_FORM", .value = 489, }, - { .name = "DIV_W_XO_FORM", .value = 491, }, -}; - -static struct insn_offset arithmetic_two_ops[] = { - { .name = "mulli", .value = 7, }, - { .name = "subfic", .value = 8, }, - { .name = "addic", .value = 12, }, - { .name = "addic.", .value = 13, }, - { .name = "addi", .value = 14, }, - { .name = "addis", .value = 15, }, -}; - -static int cmp_offset(const void *a, const void *b) -{ - const struct insn_offset *val1 = a; - const struct insn_offset *val2 = b; - - return (val1->value - val2->value); -} - -static const struct ins_ops *check_ppc_insn(struct disasm_line *dl) -{ - int raw_insn = dl->raw.raw_insn; - int opcode = PPC_OP(raw_insn); - int mem_insn_31 = PPC_21_30(raw_insn); - struct insn_offset *ret; - struct insn_offset mem_insns_31_opcode = { - "OP_31_INSN", - mem_insn_31 - }; - char name_insn[32]; - - /* - * Instructions with opcode 32 to 63 are memory - * instructions in powerpc - */ - if ((opcode & 0x20)) { - /* - * Set name in case of raw instruction to - * opcode to be used in insn-stat - */ - if (!strlen(dl->ins.name)) { - sprintf(name_insn, "%d", opcode); - dl->ins.name = strdup(name_insn); - } - return &load_store_ops; - } else if (opcode == 31) { - /* Check for memory instructions with opcode 31 */ - ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); - if (ret) { - if (!strlen(dl->ins.name)) - dl->ins.name = strdup(ret->name); - return &load_store_ops; - } else { - mem_insns_31_opcode.value = PPC_22_30(raw_insn); - ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), - sizeof(arithmetic_ins_op_31[0]), cmp_offset); - if (ret != NULL) - return &arithmetic_ops; - /* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */ - if (PPC_21_30(raw_insn) == 444) - return &arithmetic_ops; - } - } else { - mem_insns_31_opcode.value = opcode; - ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops), - sizeof(arithmetic_two_ops[0]), cmp_offset); - if (ret != NULL) - return &arithmetic_ops; - } - - return NULL; -} - -/* - * Instruction tracking function to track register state moves. - * Example sequence: - * ld r10,264(r3) - * mr r31,r3 - * < - * ld r9,312(r31) - * - * Previous instruction sequence shows that register state of r3 - * is moved to r31. update_insn_state_powerpc tracks these state - * changes - */ -#ifdef HAVE_LIBDW_SUPPORT -static void update_insn_state_powerpc(struct type_state *state, - struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused, - struct disasm_line *dl) -{ - struct annotated_insn_loc loc; - struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; - struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; - struct type_state_reg *tsr; - u32 insn_offset = dl->al.offset; - - if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) - return; - - /* - * Value 444 for bits 21:30 is for "mr" - * instruction. "mr" is extended OR. So set the - * source and destination reg correctly - */ - if (PPC_21_30(dl->raw.raw_insn) == 444) { - int src_reg = src->reg1; - - src->reg1 = dst->reg1; - dst->reg1 = src_reg; - } - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) { - tsr->ok = false; - return; - } - - tsr->type = state->regs[src->reg1].type; - tsr->kind = state->regs[src->reg1].kind; - tsr->ok = true; - - pr_debug_dtp("mov [%x] reg%d -> reg%d", - insn_offset, src->reg1, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); -} -#endif /* HAVE_LIBDW_SUPPORT */ - -static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = powerpc__associate_instruction_ops; - arch->objdump.comment_char = '#'; - annotate_opts.show_asm_raw = true; - arch->e_machine = EM_PPC; - arch->e_flags = 0; - } - - return 0; -} diff --git a/tools/perf/arch/riscv64/annotate/instructions.c b/tools/perf/arch/riscv64/annotate/instructions.c deleted file mode 100644 index a34798864fab..000000000000 --- a/tools/perf/arch/riscv64/annotate/instructions.c +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -static -const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name) -{ - const struct ins_ops *ops = NULL; - - if (!strncmp(name, "jal", 3) || - !strncmp(name, "jr", 2) || - !strncmp(name, "call", 4)) - ops = &call_ops; - else if (!strncmp(name, "ret", 3)) - ops = &ret_ops; - else if (name[0] == 'j' || name[0] == 'b') - ops = &jump_ops; - else - return NULL; - - arch__associate_ins_ops(arch, name, ops); - - return ops; -} - -static -int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - if (!arch->initialized) { - arch->associate_instruction_ops = riscv64__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - arch->e_machine = EM_RISCV; - arch->e_flags = 0; - } - - return 0; -} diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c deleted file mode 100644 index 1b22e6276e7d..000000000000 --- a/tools/perf/arch/s390/annotate/instructions.c +++ /dev/null @@ -1,178 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include - -static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms, - struct disasm_line *dl __maybe_unused) -{ - char *endptr, *tok, *name; - struct map *map = ms->map; - struct addr_map_symbol target; - - tok = strchr(ops->raw, ','); - if (!tok) - return -1; - - ops->target.addr = strtoull(tok + 1, &endptr, 16); - - name = strchr(endptr, '<'); - if (name == NULL) - return -1; - - name++; - - if (arch->objdump.skip_functions_char && - strchr(name, arch->objdump.skip_functions_char)) - return -1; - - tok = strchr(name, '>'); - if (tok == NULL) - return -1; - - *tok = '\0'; - ops->target.name = strdup(name); - *tok = '>'; - - if (ops->target.name == NULL) - return -1; - - target = (struct addr_map_symbol) { - .ms = { .map = map__get(map), }, - .addr = map__objdump_2mem(map, ops->target.addr), - }; - - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - addr_map_symbol__exit(&target); - return 0; -} - -static const struct ins_ops s390_call_ops = { - .parse = s390_call__parse, - .scnprintf = call__scnprintf, -}; - -static int s390_mov__parse(const struct arch *arch __maybe_unused, - struct ins_operands *ops, - struct map_symbol *ms __maybe_unused, - struct disasm_line *dl __maybe_unused) -{ - char *s = strchr(ops->raw, ','), *target, *endptr; - - if (s == NULL) - return -1; - - *s = '\0'; - ops->source.raw = strdup(ops->raw); - *s = ','; - - if (ops->source.raw == NULL) - return -1; - - target = ++s; - ops->target.raw = strdup(target); - if (ops->target.raw == NULL) - goto out_free_source; - - ops->target.addr = strtoull(target, &endptr, 16); - if (endptr == target) - goto out_free_target; - - s = strchr(endptr, '<'); - if (s == NULL) - goto out_free_target; - endptr = strchr(s + 1, '>'); - if (endptr == NULL) - goto out_free_target; - - *endptr = '\0'; - ops->target.name = strdup(s + 1); - *endptr = '>'; - if (ops->target.name == NULL) - goto out_free_target; - - return 0; - -out_free_target: - zfree(&ops->target.raw); -out_free_source: - zfree(&ops->source.raw); - return -1; -} - - -static const struct ins_ops s390_mov_ops = { - .parse = s390_mov__parse, - .scnprintf = mov__scnprintf, -}; - -static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) -{ - const struct ins_ops *ops = NULL; - - /* catch all kind of jumps */ - if (strchr(name, 'j') || - !strncmp(name, "bct", 3) || - !strncmp(name, "br", 2)) - ops = &jump_ops; - /* override call/returns */ - if (!strcmp(name, "bras") || - !strcmp(name, "brasl") || - !strcmp(name, "basr")) - ops = &s390_call_ops; - if (!strcmp(name, "br")) - ops = &ret_ops; - /* override load/store relative to PC */ - if (!strcmp(name, "lrl") || - !strcmp(name, "lgrl") || - !strcmp(name, "lgfrl") || - !strcmp(name, "llgfrl") || - !strcmp(name, "strl") || - !strcmp(name, "stgrl")) - ops = &s390_mov_ops; - - if (ops) - arch__associate_ins_ops(arch, name, ops); - return ops; -} - -static int s390__cpuid_parse(struct arch *arch, char *cpuid) -{ - unsigned int family; - char model[16], model_c[16], cpumf_v[16], cpumf_a[16]; - int ret; - - /* - * cpuid string format: - * "IBM,family,model-capacity,model[,cpum_cf-version,cpum_cf-authorization]" - */ - ret = sscanf(cpuid, "%*[^,],%u,%[^,],%[^,],%[^,],%s", &family, model_c, - model, cpumf_v, cpumf_a); - if (ret >= 2) { - arch->family = family; - arch->model = 0; - return 0; - } - - return -1; -} - -static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - int err = 0; - - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = s390__associate_ins_ops; - if (cpuid) { - if (s390__cpuid_parse(arch, cpuid)) - err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; - } - arch->e_machine = EM_S390; - arch->e_flags = 0; - } - - return err; -} diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c deleted file mode 100644 index a08d8734c883..000000000000 --- a/tools/perf/arch/sparc/annotate/instructions.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -static int is_branch_cond(const char *cond) -{ - if (cond[0] == '\0') - return 1; - - if (cond[0] == 'a' && cond[1] == '\0') - return 1; - - if (cond[0] == 'c' && - (cond[1] == 'c' || cond[1] == 's') && - cond[2] == '\0') - return 1; - - if (cond[0] == 'e' && - (cond[1] == '\0' || - (cond[1] == 'q' && cond[2] == '\0'))) - return 1; - - if (cond[0] == 'g' && - (cond[1] == '\0' || - (cond[1] == 't' && cond[2] == '\0') || - (cond[1] == 'e' && cond[2] == '\0') || - (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) - return 1; - - if (cond[0] == 'l' && - (cond[1] == '\0' || - (cond[1] == 't' && cond[2] == '\0') || - (cond[1] == 'u' && cond[2] == '\0') || - (cond[1] == 'e' && cond[2] == '\0') || - (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) - return 1; - - if (cond[0] == 'n' && - (cond[1] == '\0' || - (cond[1] == 'e' && cond[2] == '\0') || - (cond[1] == 'z' && cond[2] == '\0') || - (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0'))) - return 1; - - if (cond[0] == 'b' && - cond[1] == 'p' && - cond[2] == 'o' && - cond[3] == 's' && - cond[4] == '\0') - return 1; - - if (cond[0] == 'v' && - (cond[1] == 'c' || cond[1] == 's') && - cond[2] == '\0') - return 1; - - if (cond[0] == 'b' && - cond[1] == 'z' && - cond[2] == '\0') - return 1; - - return 0; -} - -static int is_branch_reg_cond(const char *cond) -{ - if ((cond[0] == 'n' || cond[0] == 'l') && - cond[1] == 'z' && - cond[2] == '\0') - return 1; - - if (cond[0] == 'z' && - cond[1] == '\0') - return 1; - - if ((cond[0] == 'g' || cond[0] == 'l') && - cond[1] == 'e' && - cond[2] == 'z' && - cond[3] == '\0') - return 1; - - if (cond[0] == 'g' && - cond[1] == 'z' && - cond[2] == '\0') - return 1; - - return 0; -} - -static int is_branch_float_cond(const char *cond) -{ - if (cond[0] == '\0') - return 1; - - if ((cond[0] == 'a' || cond[0] == 'e' || - cond[0] == 'z' || cond[0] == 'g' || - cond[0] == 'l' || cond[0] == 'n' || - cond[0] == 'o' || cond[0] == 'u') && - cond[1] == '\0') - return 1; - - if (((cond[0] == 'g' && cond[1] == 'e') || - (cond[0] == 'l' && (cond[1] == 'e' || - cond[1] == 'g')) || - (cond[0] == 'n' && (cond[1] == 'e' || - cond[1] == 'z')) || - (cond[0] == 'u' && (cond[1] == 'e' || - cond[1] == 'g' || - cond[1] == 'l'))) && - cond[2] == '\0') - return 1; - - if (cond[0] == 'u' && - (cond[1] == 'g' || cond[1] == 'l') && - cond[2] == 'e' && - cond[3] == '\0') - return 1; - - return 0; -} - -static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) -{ - const struct ins_ops *ops = NULL; - - if (!strcmp(name, "call") || - !strcmp(name, "jmp") || - !strcmp(name, "jmpl")) { - ops = &call_ops; - } else if (!strcmp(name, "ret") || - !strcmp(name, "retl") || - !strcmp(name, "return")) { - ops = &ret_ops; - } else if (!strcmp(name, "mov")) { - ops = &mov_ops; - } else { - if (name[0] == 'c' && - (name[1] == 'w' || name[1] == 'x')) - name += 2; - - if (name[0] == 'b') { - const char *cond = name + 1; - - if (cond[0] == 'r') { - if (is_branch_reg_cond(cond + 1)) - ops = &jump_ops; - } else if (is_branch_cond(cond)) { - ops = &jump_ops; - } - } else if (name[0] == 'f' && name[1] == 'b') { - if (is_branch_float_cond(name + 2)) - ops = &jump_ops; - } - } - - if (ops) - arch__associate_ins_ops(arch, name, ops); - - return ops; -} - -static int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) -{ - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = sparc__associate_instruction_ops; - arch->objdump.comment_char = '#'; - arch->e_machine = EM_SPARC; - arch->e_flags = 0; - } - - return 0; -} diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c deleted file mode 100644 index ffca3029388b..000000000000 --- a/tools/perf/arch/x86/annotate/instructions.c +++ /dev/null @@ -1,797 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * x86 instruction nmemonic table to parse disasm lines for annotate. - * This table is searched twice - one for exact match and another for - * match without a size suffix (b, w, l, q) in case of AT&T syntax. - * - * So this table should not have entries with the suffix unless it's - * a complete different instruction than ones without the suffix. - */ -static const struct ins x86__instructions[] = { - { .name = "adc", .ops = &mov_ops, }, - { .name = "add", .ops = &mov_ops, }, - { .name = "addsd", .ops = &mov_ops, }, - { .name = "and", .ops = &mov_ops, }, - { .name = "andpd", .ops = &mov_ops, }, - { .name = "andps", .ops = &mov_ops, }, - { .name = "bsr", .ops = &mov_ops, }, - { .name = "bt", .ops = &mov_ops, }, - { .name = "btr", .ops = &mov_ops, }, - { .name = "bts", .ops = &mov_ops, }, - { .name = "call", .ops = &call_ops, }, - { .name = "cmovae", .ops = &mov_ops, }, - { .name = "cmovbe", .ops = &mov_ops, }, - { .name = "cmove", .ops = &mov_ops, }, - { .name = "cmp", .ops = &mov_ops, }, - { .name = "cmpxch", .ops = &mov_ops, }, - { .name = "cmpxchg", .ops = &mov_ops, }, - { .name = "cs", .ops = &mov_ops, }, - { .name = "dec", .ops = &dec_ops, }, - { .name = "divsd", .ops = &mov_ops, }, - { .name = "divss", .ops = &mov_ops, }, - { .name = "gs", .ops = &mov_ops, }, - { .name = "imul", .ops = &mov_ops, }, - { .name = "inc", .ops = &dec_ops, }, - { .name = "ja", .ops = &jump_ops, }, - { .name = "jae", .ops = &jump_ops, }, - { .name = "jb", .ops = &jump_ops, }, - { .name = "jbe", .ops = &jump_ops, }, - { .name = "jc", .ops = &jump_ops, }, - { .name = "jcxz", .ops = &jump_ops, }, - { .name = "je", .ops = &jump_ops, }, - { .name = "jecxz", .ops = &jump_ops, }, - { .name = "jg", .ops = &jump_ops, }, - { .name = "jge", .ops = &jump_ops, }, - { .name = "jl", .ops = &jump_ops, }, - { .name = "jle", .ops = &jump_ops, }, - { .name = "jmp", .ops = &jump_ops, }, - { .name = "jna", .ops = &jump_ops, }, - { .name = "jnae", .ops = &jump_ops, }, - { .name = "jnb", .ops = &jump_ops, }, - { .name = "jnbe", .ops = &jump_ops, }, - { .name = "jnc", .ops = &jump_ops, }, - { .name = "jne", .ops = &jump_ops, }, - { .name = "jng", .ops = &jump_ops, }, - { .name = "jnge", .ops = &jump_ops, }, - { .name = "jnl", .ops = &jump_ops, }, - { .name = "jnle", .ops = &jump_ops, }, - { .name = "jno", .ops = &jump_ops, }, - { .name = "jnp", .ops = &jump_ops, }, - { .name = "jns", .ops = &jump_ops, }, - { .name = "jnz", .ops = &jump_ops, }, - { .name = "jo", .ops = &jump_ops, }, - { .name = "jp", .ops = &jump_ops, }, - { .name = "jpe", .ops = &jump_ops, }, - { .name = "jpo", .ops = &jump_ops, }, - { .name = "jrcxz", .ops = &jump_ops, }, - { .name = "js", .ops = &jump_ops, }, - { .name = "jz", .ops = &jump_ops, }, - { .name = "lea", .ops = &mov_ops, }, - { .name = "lock", .ops = &lock_ops, }, - { .name = "mov", .ops = &mov_ops, }, - { .name = "movapd", .ops = &mov_ops, }, - { .name = "movaps", .ops = &mov_ops, }, - { .name = "movdqa", .ops = &mov_ops, }, - { .name = "movdqu", .ops = &mov_ops, }, - { .name = "movsb", .ops = &mov_ops, }, - { .name = "movsd", .ops = &mov_ops, }, - { .name = "movsl", .ops = &mov_ops, }, - { .name = "movss", .ops = &mov_ops, }, - { .name = "movsw", .ops = &mov_ops, }, - { .name = "movupd", .ops = &mov_ops, }, - { .name = "movups", .ops = &mov_ops, }, - { .name = "movzb", .ops = &mov_ops, }, - { .name = "movzl", .ops = &mov_ops, }, - { .name = "movzw", .ops = &mov_ops, }, - { .name = "mulsd", .ops = &mov_ops, }, - { .name = "mulss", .ops = &mov_ops, }, - { .name = "nop", .ops = &nop_ops, }, - { .name = "or", .ops = &mov_ops, }, - { .name = "orps", .ops = &mov_ops, }, - { .name = "paddq", .ops = &mov_ops, }, - { .name = "pand", .ops = &mov_ops, }, - { .name = "pcmpeqb", .ops = &mov_ops, }, - { .name = "por", .ops = &mov_ops, }, - { .name = "rcl", .ops = &mov_ops, }, - { .name = "ret", .ops = &ret_ops, }, - { .name = "sbb", .ops = &mov_ops, }, - { .name = "sete", .ops = &mov_ops, }, - { .name = "sub", .ops = &mov_ops, }, - { .name = "subsd", .ops = &mov_ops, }, - { .name = "test", .ops = &mov_ops, }, - { .name = "tzcnt", .ops = &mov_ops, }, - { .name = "ucomisd", .ops = &mov_ops, }, - { .name = "ucomiss", .ops = &mov_ops, }, - { .name = "vaddsd", .ops = &mov_ops, }, - { .name = "vandpd", .ops = &mov_ops, }, - { .name = "vmovdqa", .ops = &mov_ops, }, - { .name = "vmovq", .ops = &mov_ops, }, - { .name = "vmovsd", .ops = &mov_ops, }, - { .name = "vmulsd", .ops = &mov_ops, }, - { .name = "vorpd", .ops = &mov_ops, }, - { .name = "vsubsd", .ops = &mov_ops, }, - { .name = "vucomisd", .ops = &mov_ops, }, - { .name = "xadd", .ops = &mov_ops, }, - { .name = "xbegin", .ops = &jump_ops, }, - { .name = "xchg", .ops = &mov_ops, }, - { .name = "xor", .ops = &mov_ops, }, - { .name = "xorpd", .ops = &mov_ops, }, - { .name = "xorps", .ops = &mov_ops, }, -}; - -static bool amd__ins_is_fused(const struct arch *arch, const char *ins1, - const char *ins2) -{ - if (strstr(ins2, "jmp")) - return false; - - /* Family >= 15h supports cmp/test + branch fusion */ - if (arch->family >= 0x15 && (strstarts(ins1, "test") || - (strstarts(ins1, "cmp") && !strstr(ins1, "xchg")))) { - return true; - } - - /* Family >= 19h supports some ALU + branch fusion */ - if (arch->family >= 0x19 && (strstarts(ins1, "add") || - strstarts(ins1, "sub") || strstarts(ins1, "and") || - strstarts(ins1, "inc") || strstarts(ins1, "dec") || - strstarts(ins1, "or") || strstarts(ins1, "xor"))) { - return true; - } - - return false; -} - -static bool intel__ins_is_fused(const struct arch *arch, const char *ins1, - const char *ins2) -{ - if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp")) - return false; - - if (arch->model == 0x1e) { - /* Nehalem */ - if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) || - strstr(ins1, "test")) { - return true; - } - } else { - /* Newer platform */ - if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) || - strstr(ins1, "test") || - strstr(ins1, "add") || - strstr(ins1, "sub") || - strstr(ins1, "and") || - strstr(ins1, "inc") || - strstr(ins1, "dec")) { - return true; - } - } - - return false; -} - -static int x86__cpuid_parse(struct arch *arch, char *cpuid) -{ - unsigned int family, model, stepping; - int ret; - - /* - * cpuid = "GenuineIntel,family,model,stepping" - */ - ret = sscanf(cpuid, "%*[^,],%u,%u,%u", &family, &model, &stepping); - if (ret == 3) { - arch->family = family; - arch->model = model; - arch->ins_is_fused = strstarts(cpuid, "AuthenticAMD") ? - amd__ins_is_fused : - intel__ins_is_fused; - return 0; - } - - return -1; -} - -static int x86__annotate_init(struct arch *arch, char *cpuid) -{ - int err = 0; - - if (arch->initialized) - return 0; - - if (cpuid) { - if (x86__cpuid_parse(arch, cpuid)) - err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; - } - -#ifndef NDEBUG - { - static bool sorted_check; - - if (!sorted_check) { - for (size_t i = 0; i < arch->nr_instructions - 1; i++) { - assert(strcmp(arch->instructions[i].name, - arch->instructions[i + 1].name) <= 0); - } - sorted_check = true; - } - } -#endif - arch->e_machine = EM_X86_64; - arch->e_flags = 0; - arch->initialized = true; - return err; -} - -#ifdef HAVE_LIBDW_SUPPORT -static void update_insn_state_x86(struct type_state *state, - struct data_loc_info *dloc, Dwarf_Die *cu_die, - struct disasm_line *dl) -{ - struct annotated_insn_loc loc; - struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; - struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; - struct type_state_reg *tsr; - Dwarf_Die type_die; - u32 insn_offset = dl->al.offset; - int fbreg = dloc->fbreg; - int fboff = 0; - - if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) - return; - - if (ins__is_call(&dl->ins)) { - struct symbol *func = dl->ops.target.sym; - - if (func == NULL) - return; - - /* __fentry__ will preserve all registers */ - if (!strcmp(func->name, "__fentry__")) - return; - - pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); - - /* Otherwise invalidate caller-saved registers after call */ - for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { - if (state->regs[i].caller_saved) - state->regs[i].ok = false; - } - - /* Update register with the return type (if any) */ - if (die_find_func_rettype(cu_die, func->name, &type_die)) { - tsr = &state->regs[state->ret_reg]; - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("call [%x] return -> reg%d", - insn_offset, state->ret_reg); - pr_debug_type_name(&type_die, tsr->kind); - } - return; - } - - if (!strncmp(dl->ins.name, "add", 3)) { - u64 imm_value = -1ULL; - int offset; - const char *var_name = NULL; - struct map_symbol *ms = dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - tsr->copied_from = -1; - - if (src->imm) - imm_value = src->offset; - else if (has_reg_type(state, src->reg1) && - state->regs[src->reg1].kind == TSR_KIND_CONST) - imm_value = state->regs[src->reg1].imm_value; - else if (src->reg1 == DWARF_REG_PC) { - u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, - src->offset, dl); - - if (get_global_var_info(dloc, var_addr, - &var_name, &offset) && - !strcmp(var_name, "this_cpu_off") && - tsr->kind == TSR_KIND_CONST) { - tsr->kind = TSR_KIND_PERCPU_BASE; - tsr->offset = 0; - tsr->ok = true; - imm_value = tsr->imm_value; - } - } - else - return; - - /* Ignore add to non-pointer or non-const types */ - if (tsr->kind == TSR_KIND_POINTER || - (dwarf_tag(&tsr->type) == DW_TAG_pointer_type && - src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) { - tsr->offset += imm_value; - pr_debug_dtp("add [%x] offset %#"PRIx64" to reg%d", - insn_offset, imm_value, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - - if (tsr->kind == TSR_KIND_CONST) - tsr->imm_value += imm_value; - - if (tsr->kind != TSR_KIND_PERCPU_BASE) - return; - - if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, - &type_die) && offset == 0) { - /* - * This is not a pointer type, but it should be treated - * as a pointer. - */ - tsr->type = type_die; - tsr->kind = TSR_KIND_PERCPU_POINTER; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", - insn_offset, imm_value, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - return; - } - - if (!strncmp(dl->ins.name, "sub", 3)) { - u64 imm_value = -1ULL; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - tsr->copied_from = -1; - - if (src->imm) - imm_value = src->offset; - else if (has_reg_type(state, src->reg1) && - state->regs[src->reg1].kind == TSR_KIND_CONST) - imm_value = state->regs[src->reg1].imm_value; - - if (tsr->kind == TSR_KIND_POINTER || - (dwarf_tag(&tsr->type) == DW_TAG_pointer_type && - src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) { - tsr->offset -= imm_value; - pr_debug_dtp("sub [%x] offset %#"PRIx64" to reg%d", - insn_offset, imm_value, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - - if (tsr->kind == TSR_KIND_CONST) - tsr->imm_value -= imm_value; - - return; - } - - if (!strncmp(dl->ins.name, "lea", 3)) { - int sreg = src->reg1; - struct type_state_reg src_tsr; - - if (!has_reg_type(state, sreg) || - !has_reg_type(state, dst->reg1) || - !src->mem_ref) - return; - - src_tsr = state->regs[sreg]; - tsr = &state->regs[dst->reg1]; - - tsr->copied_from = -1; - tsr->ok = false; - - /* Case 1: Based on stack pointer or frame pointer */ - if (sreg == fbreg || sreg == state->stack_reg) { - struct type_state_stack *stack; - int offset = src->offset - fboff; - - stack = find_stack_state(state, offset); - if (!stack) - return; - - tsr->type = stack->type; - tsr->kind = TSR_KIND_POINTER; - tsr->offset = offset - stack->offset; - tsr->ok = true; - - if (sreg == fbreg) { - pr_debug_dtp("lea [%x] address of -%#x(stack) -> reg%d", - insn_offset, -src->offset, dst->reg1); - } else { - pr_debug_dtp("lea [%x] address of %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - } - - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Case 2: Based on a register holding a typed pointer */ - else if (src_tsr.ok && (src_tsr.kind == TSR_KIND_POINTER || - (dwarf_tag(&src_tsr.type) == DW_TAG_pointer_type && - src_tsr.kind == TSR_KIND_TYPE))) { - - if (src_tsr.kind == TSR_KIND_TYPE && - __die_get_real_type(&state->regs[sreg].type, &type_die) == NULL) - return; - - if (src_tsr.kind == TSR_KIND_POINTER) - type_die = state->regs[sreg].type; - - /* Check if the target type has a member at the new offset */ - if (die_get_member_type(&type_die, - src->offset + src_tsr.offset, &type_die) == NULL) - return; - - tsr->type = src_tsr.type; - tsr->kind = src_tsr.kind; - tsr->offset = src->offset + src_tsr.offset; - tsr->ok = true; - - pr_debug_dtp("lea [%x] address of %s%#x(reg%d) -> reg%d", - insn_offset, src->offset < 0 ? "-" : "", - abs(src->offset), sreg, dst->reg1); - - pr_debug_type_name(&tsr->type, tsr->kind); - } - return; - } - - /* Invalidate register states for other ops which may change pointers */ - if (has_reg_type(state, dst->reg1) && !dst->mem_ref && - dwarf_tag(&state->regs[dst->reg1].type) == DW_TAG_pointer_type) { - if (!strncmp(dl->ins.name, "imul", 4) || !strncmp(dl->ins.name, "mul", 3) || - !strncmp(dl->ins.name, "idiv", 4) || !strncmp(dl->ins.name, "div", 3) || - !strncmp(dl->ins.name, "shl", 3) || !strncmp(dl->ins.name, "shr", 3) || - !strncmp(dl->ins.name, "sar", 3) || !strncmp(dl->ins.name, "and", 3) || - !strncmp(dl->ins.name, "or", 2) || !strncmp(dl->ins.name, "neg", 3) || - !strncmp(dl->ins.name, "inc", 3) || !strncmp(dl->ins.name, "dec", 3)) { - pr_debug_dtp("%s [%x] invalidate reg%d\n", - dl->ins.name, insn_offset, dst->reg1); - state->regs[dst->reg1].ok = false; - state->regs[dst->reg1].copied_from = -1; - return; - } - - if (!strncmp(dl->ins.name, "xor", 3) && dst->reg1 == src->reg1) { - /* xor reg, reg clears the register */ - pr_debug_dtp("xor [%x] clear reg%d\n", - insn_offset, dst->reg1); - - state->regs[dst->reg1].kind = TSR_KIND_CONST; - state->regs[dst->reg1].imm_value = 0; - state->regs[dst->reg1].ok = true; - state->regs[dst->reg1].copied_from = -1; - return; - } - } - - if (strncmp(dl->ins.name, "mov", 3)) - return; - - if (dloc->fb_cfa) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 pc = map__rip_2objdump(dloc->ms->map, ip); - - if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) - fbreg = -1; - } - - /* Case 1. register to register or segment:offset to register transfers */ - if (!src->mem_ref && !dst->mem_ref) { - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - tsr->copied_from = -1; - - if (dso__kernel(map__dso(dloc->ms->map)) && - src->segment == INSN_SEG_X86_GS && src->imm) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr; - int offset; - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. - */ - var_addr = src->offset; - - if (var_addr == 40) { - tsr->kind = TSR_KIND_CANARY; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] stack canary -> reg%d\n", - insn_offset, dst->reg1); - return; - } - - if (!get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", - insn_offset, var_addr, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - return; - } - - if (src->imm) { - tsr->kind = TSR_KIND_CONST; - tsr->imm_value = src->offset; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", - insn_offset, tsr->imm_value, dst->reg1); - return; - } - - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) { - tsr->ok = false; - return; - } - - tsr->type = state->regs[src->reg1].type; - tsr->kind = state->regs[src->reg1].kind; - tsr->imm_value = state->regs[src->reg1].imm_value; - tsr->offset = state->regs[src->reg1].offset; - tsr->ok = true; - - /* To copy back the variable type later (hopefully) */ - if (tsr->kind == TSR_KIND_TYPE || tsr->kind == TSR_KIND_POINTER) - tsr->copied_from = src->reg1; - - pr_debug_dtp("mov [%x] reg%d -> reg%d", - insn_offset, src->reg1, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Case 2. memory to register transers */ - if (src->mem_ref && !dst->mem_ref) { - int sreg = src->reg1; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - tsr->copied_from = -1; - -retry: - /* Check stack variables with offset */ - if (sreg == fbreg || sreg == state->stack_reg) { - struct type_state_stack *stack; - int offset = src->offset - fboff; - - stack = find_stack_state(state, offset); - if (stack == NULL) { - tsr->ok = false; - return; - } else if (!stack->compound) { - tsr->type = stack->type; - tsr->kind = stack->kind; - tsr->offset = stack->ptr_offset; - tsr->ok = true; - } else if (die_get_member_type(&stack->type, - offset - stack->offset, - &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - } else { - tsr->ok = false; - return; - } - - if (sreg == fbreg) { - pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", - insn_offset, -offset, dst->reg1); - } else { - pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", - insn_offset, offset, sreg, dst->reg1); - } - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* And then dereference the pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_TYPE && - die_deref_ptr_type(&state->regs[sreg].type, - src->offset + state->regs[sreg].offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Handle dereference of TSR_KIND_POINTER registers */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_POINTER && - die_get_member_type(&state->regs[sreg].type, - src->offset + state->regs[sreg].offset, &type_die)) { - tsr->type = state->regs[sreg].type; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = src->offset + state->regs[sreg].offset; - tsr->ok = true; - - pr_debug_dtp("mov [%x] addr %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or check if it's a global variable */ - else if (sreg == DWARF_REG_PC) { - struct map_symbol *ms = dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - u64 addr; - int offset; - - addr = annotate_calc_pcrel(ms, ip, src->offset, dl); - - if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, - &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", - insn_offset, addr, dst->reg1); - pr_debug_type_name(&type_die, tsr->kind); - } - /* And check percpu access with base register */ - else if (has_reg_type(state, sreg) && - state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr = src->offset; - int offset; - - if (src->multi_regs) { - int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; - - if (has_reg_type(state, reg2) && state->regs[reg2].ok && - state->regs[reg2].kind == TSR_KIND_CONST) - var_addr += state->regs[reg2].imm_value; - } - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. - */ - if (get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) && - die_get_member_type(&type_die, offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - if (src->multi_regs) { - pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", - insn_offset, src->offset, src->reg1, - src->reg2, dst->reg1); - } else { - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - } - pr_debug_type_name(&tsr->type, tsr->kind); - } else { - tsr->ok = false; - } - } - /* And then dereference the calculated pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_PERCPU_POINTER && - die_get_member_type(&state->regs[sreg].type, - src->offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or try another register if any */ - else if (src->multi_regs && sreg == src->reg1 && - src->reg1 != src->reg2) { - sreg = src->reg2; - goto retry; - } - else { - int offset; - const char *var_name = NULL; - - /* it might be per-cpu variable (in kernel) access */ - if (src->offset < 0) { - if (get_global_var_info(dloc, (s64)src->offset, - &var_name, &offset) && - !strcmp(var_name, "__per_cpu_offset")) { - tsr->kind = TSR_KIND_PERCPU_BASE; - tsr->offset = 0; - tsr->ok = true; - - pr_debug_dtp("mov [%x] percpu base reg%d\n", - insn_offset, dst->reg1); - return; - } - } - - tsr->ok = false; - } - } - /* Case 3. register to memory transfers */ - if (!src->mem_ref && dst->mem_ref) { - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) - return; - - /* Check stack variables with offset */ - if (dst->reg1 == fbreg || dst->reg1 == state->stack_reg) { - struct type_state_stack *stack; - int offset = dst->offset - fboff; - - tsr = &state->regs[src->reg1]; - - stack = find_stack_state(state, offset); - if (stack) { - /* - * The source register is likely to hold a type - * of member if it's a compound type. Do not - * update the stack variable type since we can - * get the member type later by using the - * die_get_member_type(). - */ - if (!stack->compound) - set_stack_state(stack, offset, tsr->kind, - &tsr->type, tsr->offset); - } else { - findnew_stack_state(state, offset, tsr->kind, - &tsr->type, tsr->offset); - } - - if (dst->reg1 == fbreg) { - pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", - insn_offset, src->reg1, -offset); - } else { - pr_debug_dtp("mov [%x] reg%d -> %#x(reg%d)", - insn_offset, src->reg1, offset, dst->reg1); - } - if (tsr->offset != 0) { - pr_debug_dtp(" reg%d offset %#x ->", - src->reg1, tsr->offset); - } - - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* - * Ignore other transfers since it'd set a value in a struct - * and won't change the type. - */ - } - /* Case 4. memory to memory transfers (not handled for now) */ -} -#endif diff --git a/tools/perf/util/Build b/tools/perf/util/Build index c30ff257f8b4..b9925c6902ca 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -1,6 +1,7 @@ include $(srctree)/tools/scripts/Makefile.include include $(srctree)/tools/scripts/utilities.mak +perf-util-y += annotate-arch/ perf-util-y += arm64-frame-pointer-unwind-support.o perf-util-y += addr2line.o perf-util-y += addr_location.o diff --git a/tools/perf/util/annotate-arch/Build b/tools/perf/util/annotate-arch/Build new file mode 100644 index 000000000000..23316743fdc5 --- /dev/null +++ b/tools/perf/util/annotate-arch/Build @@ -0,0 +1,11 @@ +perf-util-y += annotate-arc.o +perf-util-y += annotate-arm.o +perf-util-y += annotate-arm64.o +perf-util-y += annotate-csky.o +perf-util-y += annotate-loongarch.o +perf-util-y += annotate-mips.o +perf-util-y += annotate-x86.o +perf-util-y += annotate-powerpc.o +perf-util-y += annotate-riscv64.o +perf-util-y += annotate-s390.o +perf-util-y += annotate-sparc.o diff --git a/tools/perf/util/annotate-arch/annotate-arc.c b/tools/perf/util/annotate-arch/annotate-arc.c new file mode 100644 index 000000000000..d7ca08ca5600 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-arc.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "../disasm.h" + +int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + arch->initialized = true; + arch->objdump.comment_char = ';'; + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-arm.c b/tools/perf/util/annotate-arch/annotate-arm.c new file mode 100644 index 000000000000..08c49067c3c9 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-arm.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include "../annotate.h" +#include "../disasm.h" + +struct arm_annotate { + regex_t call_insn, + jump_insn; +}; + +static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name) +{ + struct arm_annotate *arm = arch->priv; + const struct ins_ops *ops; + regmatch_t match[2]; + + if (!regexec(&arm->call_insn, name, 2, match, 0)) + ops = &call_ops; + else if (!regexec(&arm->jump_insn, name, 2, match, 0)) + ops = &jump_ops; + else + return NULL; + + arch__associate_ins_ops(arch, name, ops); + return ops; +} + +int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + struct arm_annotate *arm; + int err; + + if (arch->initialized) + return 0; + + arm = zalloc(sizeof(*arm)); + if (!arm) + return ENOMEM; + +#define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)" + err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED); + if (err) + goto out_free_arm; + err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED); + if (err) + goto out_free_call; +#undef ARM_CONDS + + arch->initialized = true; + arch->priv = arm; + arch->associate_instruction_ops = arm__associate_instruction_ops; + arch->objdump.comment_char = ';'; + arch->objdump.skip_functions_char = '+'; + return 0; + +out_free_call: + regfree(&arm->call_insn); +out_free_arm: + free(arm); + return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; +} diff --git a/tools/perf/util/annotate-arch/annotate-arm64.c b/tools/perf/util/annotate-arch/annotate-arm64.c new file mode 100644 index 000000000000..d2ea32984b0d --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-arm64.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "../annotate.h" +#include "../disasm.h" + +struct arm64_annotate { + regex_t call_insn, + jump_insn; +}; + +static int arm64_mov__parse(const struct arch *arch __maybe_unused, + struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) +{ + char *s = strchr(ops->raw, ','), *target, *endptr; + + if (s == NULL) + return -1; + + *s = '\0'; + ops->source.raw = strdup(ops->raw); + *s = ','; + + if (ops->source.raw == NULL) + return -1; + + target = ++s; + ops->target.raw = strdup(target); + if (ops->target.raw == NULL) + goto out_free_source; + + ops->target.addr = strtoull(target, &endptr, 16); + if (endptr == target) + goto out_free_target; + + s = strchr(endptr, '<'); + if (s == NULL) + goto out_free_target; + endptr = strchr(s + 1, '>'); + if (endptr == NULL) + goto out_free_target; + + *endptr = '\0'; + *s = ' '; + ops->target.name = strdup(s); + *s = '<'; + *endptr = '>'; + if (ops->target.name == NULL) + goto out_free_target; + + return 0; + +out_free_target: + zfree(&ops->target.raw); +out_free_source: + zfree(&ops->source.raw); + return -1; +} + +static const struct ins_ops arm64_mov_ops = { + .parse = arm64_mov__parse, + .scnprintf = mov__scnprintf, +}; + +static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) +{ + struct arm64_annotate *arm = arch->priv; + const struct ins_ops *ops; + regmatch_t match[2]; + + if (!regexec(&arm->jump_insn, name, 2, match, 0)) + ops = &jump_ops; + else if (!regexec(&arm->call_insn, name, 2, match, 0)) + ops = &call_ops; + else if (!strcmp(name, "ret")) + ops = &ret_ops; + else + ops = &arm64_mov_ops; + + arch__associate_ins_ops(arch, name, ops); + return ops; +} + +int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + struct arm64_annotate *arm; + int err; + + if (arch->initialized) + return 0; + + arm = zalloc(sizeof(*arm)); + if (!arm) + return ENOMEM; + + /* bl, blr */ + err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED); + if (err) + goto out_free_arm; + /* b, b.cond, br, cbz/cbnz, tbz/tbnz */ + err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$", + REG_EXTENDED); + if (err) + goto out_free_call; + + arch->initialized = true; + arch->priv = arm; + arch->associate_instruction_ops = arm64__associate_instruction_ops; + arch->objdump.comment_char = '/'; + arch->objdump.skip_functions_char = '+'; + return 0; + +out_free_call: + regfree(&arm->call_insn); +out_free_arm: + free(arm); + return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; +} diff --git a/tools/perf/util/annotate-arch/annotate-csky.c b/tools/perf/util/annotate-arch/annotate-csky.c new file mode 100644 index 000000000000..0b0b09b068ec --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-csky.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. +#include +#include +#include "../disasm.h" + +static const struct ins_ops *csky__associate_ins_ops(struct arch *arch, + const char *name) +{ + const struct ins_ops *ops = NULL; + + /* catch all kind of jumps */ + if (!strcmp(name, "bt") || + !strcmp(name, "bf") || + !strcmp(name, "bez") || + !strcmp(name, "bnez") || + !strcmp(name, "bnezad") || + !strcmp(name, "bhsz") || + !strcmp(name, "bhz") || + !strcmp(name, "blsz") || + !strcmp(name, "blz") || + !strcmp(name, "br") || + !strcmp(name, "jmpi") || + !strcmp(name, "jmp")) + ops = &jump_ops; + + /* catch function call */ + if (!strcmp(name, "bsr") || + !strcmp(name, "jsri") || + !strcmp(name, "jsr")) + ops = &call_ops; + + /* catch function return */ + if (!strcmp(name, "rts")) + ops = &ret_ops; + + if (ops) + arch__associate_ins_ops(arch, name, ops); + return ops; +} + +int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + arch->initialized = true; + arch->objdump.comment_char = '/'; + arch->associate_instruction_ops = csky__associate_ins_ops; + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c new file mode 100644 index 000000000000..32df10f6fed5 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-loongarch.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Perf annotate functions. + * + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ +#include +#include +#include +#include "../disasm.h" +#include "../map.h" +#include "../maps.h" +#include "../symbol.h" + +static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) +{ + char *c, *endptr, *tok, *name; + struct map *map = ms->map; + struct addr_map_symbol target; + + c = strchr(ops->raw, '#'); + if (c++ == NULL) + return -1; + + ops->target.addr = strtoull(c, &endptr, 16); + + name = strchr(endptr, '<'); + name++; + + if (arch->objdump.skip_functions_char && + strchr(name, arch->objdump.skip_functions_char)) + return -1; + + tok = strchr(name, '>'); + if (tok == NULL) + return -1; + + *tok = '\0'; + ops->target.name = strdup(name); + *tok = '>'; + + if (ops->target.name == NULL) + return -1; + + target = (struct addr_map_symbol) { + .ms = { .map = map__get(map), }, + .addr = map__objdump_2mem(map, ops->target.addr), + }; + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + addr_map_symbol__exit(&target); + return 0; +} + +const struct ins_ops loongarch_call_ops = { + .parse = loongarch_call__parse, + .scnprintf = call__scnprintf, +}; + +static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) + +{ + struct map *map = ms->map; + struct symbol *sym = ms->sym; + struct addr_map_symbol target = { + .ms = { .map = map__get(map), }, + }; + const char *c = strchr(ops->raw, '#'); + u64 start, end; + + ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->jump.raw_func_start = strchr(ops->raw, '<'); + + if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) + c = NULL; + + if (c++ != NULL) + ops->target.addr = strtoull(c, NULL, 16); + else + ops->target.addr = strtoull(ops->raw, NULL, 16); + + target.addr = map__objdump_2mem(map, ops->target.addr); + start = map__unmap_ip(map, sym->start); + end = map__unmap_ip(map, sym->end); + + ops->target.outside = target.addr < start || target.addr > end; + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + if (!ops->target.outside) { + ops->target.offset = target.addr - start; + ops->target.offset_avail = true; + } else { + ops->target.offset_avail = false; + } + addr_map_symbol__exit(&target); + return 0; +} + +const struct ins_ops loongarch_jump_ops = { + .parse = loongarch_jump__parse, + .scnprintf = jump__scnprintf, +}; + +static +const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) +{ + const struct ins_ops *ops = NULL; + + if (!strcmp(name, "bl")) + ops = &loongarch_call_ops; + else if (!strcmp(name, "jirl")) + ops = &ret_ops; + else if (!strcmp(name, "b") || + !strncmp(name, "beq", 3) || + !strncmp(name, "bne", 3) || + !strncmp(name, "blt", 3) || + !strncmp(name, "bge", 3) || + !strncmp(name, "bltu", 4) || + !strncmp(name, "bgeu", 4)) + ops = &loongarch_jump_ops; + else + return NULL; + + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->associate_instruction_ops = loongarch__associate_ins_ops; + arch->initialized = true; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-mips.c b/tools/perf/util/annotate-arch/annotate-mips.c new file mode 100644 index 000000000000..f14b34ed77d3 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-mips.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../disasm.h" + +static +const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name) +{ + const struct ins_ops *ops = NULL; + + if (!strncmp(name, "bal", 3) || + !strncmp(name, "bgezal", 6) || + !strncmp(name, "bltzal", 6) || + !strncmp(name, "bgtzal", 6) || + !strncmp(name, "blezal", 6) || + !strncmp(name, "beqzal", 6) || + !strncmp(name, "bnezal", 6) || + !strncmp(name, "bgtzl", 5) || + !strncmp(name, "bltzl", 5) || + !strncmp(name, "bgezl", 5) || + !strncmp(name, "blezl", 5) || + !strncmp(name, "jialc", 5) || + !strncmp(name, "beql", 4) || + !strncmp(name, "bnel", 4) || + !strncmp(name, "jal", 3)) + ops = &call_ops; + else if (!strncmp(name, "jr", 2)) + ops = &ret_ops; + else if (name[0] == 'j' || name[0] == 'b') + ops = &jump_ops; + else + return NULL; + + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->associate_instruction_ops = mips__associate_ins_ops; + arch->initialized = true; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-powerpc.c b/tools/perf/util/annotate-arch/annotate-powerpc.c new file mode 100644 index 000000000000..593c138c8104 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-powerpc.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "../annotate-data.h" +#include "../debug.h" +#include "../disasm.h" + +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_21_30(R) (((R) >> 1) & 0x3ff) +#define PPC_22_30(R) (((R) >> 1) & 0x1ff) + +#define MINUS_EXT_XO_FORM 234 +#define SUB_EXT_XO_FORM 232 +#define ADD_ZERO_EXT_XO_FORM 202 +#define SUB_ZERO_EXT_XO_FORM 200 + +static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. This ops is currently + * used by powerpc and since binary instruction code is used to + * extract opcode, regs and offset, no other parsing is needed here. + * + * Dont set multi regs for 4 cases since it has only one operand + * for source: + * - Add to Minus One Extended XO-form ( Ex: addme, addmeo ) + * - Subtract From Minus One Extended XO-form ( Ex: subfme ) + * - Add to Zero Extended XO-form ( Ex: addze, addzeo ) + * - Subtract From Zero Extended XO-form ( Ex: subfze ) + */ +static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl) +{ + int opcode = PPC_OP(dl->raw.raw_insn); + + ops->source.mem_ref = false; + if (opcode == 31) { + if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) && + (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM)) + ops->source.multi_regs = true; + } + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static const struct ins_ops arithmetic_ops = { + .parse = arithmetic__parse, + .scnprintf = arithmetic__scnprintf, +}; + +static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. This ops is currently + * used by powerpc and since binary instruction code is used to + * extract opcode, regs and offset, no other parsing is needed here + */ +static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) +{ + ops->source.mem_ref = true; + ops->source.multi_regs = false; + /* opcode 31 is of X form */ + if (PPC_OP(dl->raw.raw_insn) == 31) + ops->source.multi_regs = true; + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static const struct ins_ops load_store_ops = { + .parse = load_store__parse, + .scnprintf = load_store__scnprintf, +}; + +static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name) +{ + int i; + const struct ins_ops *ops; + + /* + * - Interested only if instruction starts with 'b'. + * - Few start with 'b', but aren't branch instructions. + */ + if (name[0] != 'b' || + !strncmp(name, "bcd", 3) || + !strncmp(name, "brinc", 5) || + !strncmp(name, "bper", 4)) + return NULL; + + ops = &jump_ops; + + i = strlen(name) - 1; + if (i < 0) + return NULL; + + /* ignore optional hints at the end of the instructions */ + if (name[i] == '+' || name[i] == '-') + i--; + + if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) { + /* + * if the instruction ends up with 'l' or 'la', then + * those are considered 'calls' since they update LR. + * ... except for 'bnl' which is branch if not less than + * and the absolute form of the same. + */ + if (strcmp(name, "bnl") && strcmp(name, "bnl+") && + strcmp(name, "bnl-") && strcmp(name, "bnla") && + strcmp(name, "bnla+") && strcmp(name, "bnla-")) + ops = &call_ops; + } + if (name[i] == 'r' && name[i-1] == 'l') + /* + * instructions ending with 'lr' are considered to be + * return instructions + */ + ops = &ret_ops; + + arch__associate_ins_ops(arch, name, ops); + return ops; +} + +struct insn_offset { + const char *name; + int value; +}; + +/* + * There are memory instructions with opcode 31 which are + * of X Form, Example: + * ldx RT,RA,RB + * ______________________________________ + * | 31 | RT | RA | RB | 21 |/| + * -------------------------------------- + * 0 6 11 16 21 30 31 + * + * But all instructions with opcode 31 are not memory. + * Example: add RT,RA,RB + * + * Use bits 21 to 30 to check memory insns with 31 as opcode. + * In ins_array below, for ldx instruction: + * name => OP_31_XOP_LDX + * value => 21 + */ + +static struct insn_offset ins_array[] = { + { .name = "OP_31_XOP_LXSIWZX", .value = 12, }, + { .name = "OP_31_XOP_LWARX", .value = 20, }, + { .name = "OP_31_XOP_LDX", .value = 21, }, + { .name = "OP_31_XOP_LWZX", .value = 23, }, + { .name = "OP_31_XOP_LDUX", .value = 53, }, + { .name = "OP_31_XOP_LWZUX", .value = 55, }, + { .name = "OP_31_XOP_LXSIWAX", .value = 76, }, + { .name = "OP_31_XOP_LDARX", .value = 84, }, + { .name = "OP_31_XOP_LBZX", .value = 87, }, + { .name = "OP_31_XOP_LVX", .value = 103, }, + { .name = "OP_31_XOP_LBZUX", .value = 119, }, + { .name = "OP_31_XOP_STXSIWX", .value = 140, }, + { .name = "OP_31_XOP_STDX", .value = 149, }, + { .name = "OP_31_XOP_STWX", .value = 151, }, + { .name = "OP_31_XOP_STDUX", .value = 181, }, + { .name = "OP_31_XOP_STWUX", .value = 183, }, + { .name = "OP_31_XOP_STBX", .value = 215, }, + { .name = "OP_31_XOP_STVX", .value = 231, }, + { .name = "OP_31_XOP_STBUX", .value = 247, }, + { .name = "OP_31_XOP_LHZX", .value = 279, }, + { .name = "OP_31_XOP_LHZUX", .value = 311, }, + { .name = "OP_31_XOP_LXVDSX", .value = 332, }, + { .name = "OP_31_XOP_LWAX", .value = 341, }, + { .name = "OP_31_XOP_LHAX", .value = 343, }, + { .name = "OP_31_XOP_LWAUX", .value = 373, }, + { .name = "OP_31_XOP_LHAUX", .value = 375, }, + { .name = "OP_31_XOP_STHX", .value = 407, }, + { .name = "OP_31_XOP_STHUX", .value = 439, }, + { .name = "OP_31_XOP_LXSSPX", .value = 524, }, + { .name = "OP_31_XOP_LDBRX", .value = 532, }, + { .name = "OP_31_XOP_LSWX", .value = 533, }, + { .name = "OP_31_XOP_LWBRX", .value = 534, }, + { .name = "OP_31_XOP_LFSUX", .value = 567, }, + { .name = "OP_31_XOP_LXSDX", .value = 588, }, + { .name = "OP_31_XOP_LSWI", .value = 597, }, + { .name = "OP_31_XOP_LFDX", .value = 599, }, + { .name = "OP_31_XOP_LFDUX", .value = 631, }, + { .name = "OP_31_XOP_STXSSPX", .value = 652, }, + { .name = "OP_31_XOP_STDBRX", .value = 660, }, + { .name = "OP_31_XOP_STXWX", .value = 661, }, + { .name = "OP_31_XOP_STWBRX", .value = 662, }, + { .name = "OP_31_XOP_STFSX", .value = 663, }, + { .name = "OP_31_XOP_STFSUX", .value = 695, }, + { .name = "OP_31_XOP_STXSDX", .value = 716, }, + { .name = "OP_31_XOP_STSWI", .value = 725, }, + { .name = "OP_31_XOP_STFDX", .value = 727, }, + { .name = "OP_31_XOP_STFDUX", .value = 759, }, + { .name = "OP_31_XOP_LXVW4X", .value = 780, }, + { .name = "OP_31_XOP_LHBRX", .value = 790, }, + { .name = "OP_31_XOP_LXVD2X", .value = 844, }, + { .name = "OP_31_XOP_LFIWAX", .value = 855, }, + { .name = "OP_31_XOP_LFIWZX", .value = 887, }, + { .name = "OP_31_XOP_STXVW4X", .value = 908, }, + { .name = "OP_31_XOP_STHBRX", .value = 918, }, + { .name = "OP_31_XOP_STXVD2X", .value = 972, }, + { .name = "OP_31_XOP_STFIWX", .value = 983, }, +}; + +/* + * Arithmetic instructions which are having opcode as 31. + * These instructions are tracked to save the register state + * changes. Example: + * + * lwz r10,264(r3) + * add r31, r3, r3 + * lwz r9, 0(r31) + * + * Here instruction tracking needs to identify the "add" + * instruction and save data type of r3 to r31. If a sample + * is hit at next "lwz r9, 0(r31)", by this instruction tracking, + * data type of r31 can be resolved. + */ +static struct insn_offset arithmetic_ins_op_31[] = { + { .name = "SUB_CARRY_XO_FORM", .value = 8, }, + { .name = "MUL_HDW_XO_FORM1", .value = 9, }, + { .name = "ADD_CARRY_XO_FORM", .value = 10, }, + { .name = "MUL_HW_XO_FORM1", .value = 11, }, + { .name = "SUB_XO_FORM", .value = 40, }, + { .name = "MUL_HDW_XO_FORM", .value = 73, }, + { .name = "MUL_HW_XO_FORM", .value = 75, }, + { .name = "SUB_EXT_XO_FORM", .value = 136, }, + { .name = "ADD_EXT_XO_FORM", .value = 138, }, + { .name = "SUB_ZERO_EXT_XO_FORM", .value = 200, }, + { .name = "ADD_ZERO_EXT_XO_FORM", .value = 202, }, + { .name = "SUB_EXT_XO_FORM2", .value = 232, }, + { .name = "MUL_DW_XO_FORM", .value = 233, }, + { .name = "ADD_EXT_XO_FORM2", .value = 234, }, + { .name = "MUL_W_XO_FORM", .value = 235, }, + { .name = "ADD_XO_FORM", .value = 266, }, + { .name = "DIV_DW_XO_FORM1", .value = 457, }, + { .name = "DIV_W_XO_FORM1", .value = 459, }, + { .name = "DIV_DW_XO_FORM", .value = 489, }, + { .name = "DIV_W_XO_FORM", .value = 491, }, +}; + +static struct insn_offset arithmetic_two_ops[] = { + { .name = "mulli", .value = 7, }, + { .name = "subfic", .value = 8, }, + { .name = "addic", .value = 12, }, + { .name = "addic.", .value = 13, }, + { .name = "addi", .value = 14, }, + { .name = "addis", .value = 15, }, +}; + +static int cmp_offset(const void *a, const void *b) +{ + const struct insn_offset *val1 = a; + const struct insn_offset *val2 = b; + + return (val1->value - val2->value); +} + +const struct ins_ops *check_ppc_insn(struct disasm_line *dl) +{ + int raw_insn = dl->raw.raw_insn; + int opcode = PPC_OP(raw_insn); + int mem_insn_31 = PPC_21_30(raw_insn); + struct insn_offset *ret; + struct insn_offset mem_insns_31_opcode = { + "OP_31_INSN", + mem_insn_31 + }; + char name_insn[32]; + + /* + * Instructions with opcode 32 to 63 are memory + * instructions in powerpc + */ + if ((opcode & 0x20)) { + /* + * Set name in case of raw instruction to + * opcode to be used in insn-stat + */ + if (!strlen(dl->ins.name)) { + sprintf(name_insn, "%d", opcode); + dl->ins.name = strdup(name_insn); + } + return &load_store_ops; + } else if (opcode == 31) { + /* Check for memory instructions with opcode 31 */ + ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); + if (ret) { + if (!strlen(dl->ins.name)) + dl->ins.name = strdup(ret->name); + return &load_store_ops; + } else { + mem_insns_31_opcode.value = PPC_22_30(raw_insn); + ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), + sizeof(arithmetic_ins_op_31[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; + /* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */ + if (PPC_21_30(raw_insn) == 444) + return &arithmetic_ops; + } + } else { + mem_insns_31_opcode.value = opcode; + ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops), + sizeof(arithmetic_two_ops[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; + } + + return NULL; +} + +/* + * Instruction tracking function to track register state moves. + * Example sequence: + * ld r10,264(r3) + * mr r31,r3 + * < + * ld r9,312(r31) + * + * Previous instruction sequence shows that register state of r3 + * is moved to r31. update_insn_state_powerpc tracks these state + * changes + */ +#ifdef HAVE_LIBDW_SUPPORT +static void update_insn_state_powerpc(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + u32 insn_offset = dl->al.offset; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + /* + * Value 444 for bits 21:30 is for "mr" + * instruction. "mr" is extended OR. So set the + * source and destination reg correctly + */ + if (PPC_21_30(dl->raw.raw_insn) == 444) { + int src_reg = src->reg1; + + src->reg1 = dst->reg1; + dst->reg1 = src_reg; + } + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); +} +#endif /* HAVE_LIBDW_SUPPORT */ + +int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->initialized = true; + arch->associate_instruction_ops = powerpc__associate_instruction_ops; + arch->objdump.comment_char = '#'; + annotate_opts.show_asm_raw = true; +#ifdef HAVE_LIBDW_SUPPORT + arch->update_insn_state = update_insn_state_powerpc; +#endif + } + + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-riscv64.c b/tools/perf/util/annotate-arch/annotate-riscv64.c new file mode 100644 index 000000000000..15526824037a --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-riscv64.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../disasm.h" + +static +const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name) +{ + const struct ins_ops *ops = NULL; + + if (!strncmp(name, "jal", 3) || + !strncmp(name, "jr", 2) || + !strncmp(name, "call", 4)) + ops = &call_ops; + else if (!strncmp(name, "ret", 3)) + ops = &ret_ops; + else if (name[0] == 'j' || name[0] == 'b') + ops = &jump_ops; + else + return NULL; + + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->associate_instruction_ops = riscv64__associate_ins_ops; + arch->initialized = true; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c new file mode 100644 index 000000000000..81db102b3e15 --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-s390.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../debug.h" +#include "../disasm.h" +#include "../map.h" +#include "../maps.h" +#include "../symbol.h" +#include "../annotate.h" +#include "../annotate-data.h" + +static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, + struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) +{ + char *endptr, *tok, *name; + struct map *map = ms->map; + struct addr_map_symbol target; + + tok = strchr(ops->raw, ','); + if (!tok) + return -1; + + ops->target.addr = strtoull(tok + 1, &endptr, 16); + + name = strchr(endptr, '<'); + if (name == NULL) + return -1; + + name++; + + if (arch->objdump.skip_functions_char && + strchr(name, arch->objdump.skip_functions_char)) + return -1; + + tok = strchr(name, '>'); + if (tok == NULL) + return -1; + + *tok = '\0'; + ops->target.name = strdup(name); + *tok = '>'; + + if (ops->target.name == NULL) + return -1; + + target = (struct addr_map_symbol) { + .ms = { .map = map__get(map), }, + .addr = map__objdump_2mem(map, ops->target.addr), + }; + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + addr_map_symbol__exit(&target); + return 0; +} + +const struct ins_ops s390_call_ops = { + .parse = s390_call__parse, + .scnprintf = call__scnprintf, +}; + +static int s390_mov__parse(const struct arch *arch __maybe_unused, + struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) +{ + char *s = strchr(ops->raw, ','), *target, *endptr; + + if (s == NULL) + return -1; + + *s = '\0'; + ops->source.raw = strdup(ops->raw); + *s = ','; + + if (ops->source.raw == NULL) + return -1; + + target = ++s; + ops->target.raw = strdup(target); + if (ops->target.raw == NULL) + goto out_free_source; + + ops->target.addr = strtoull(target, &endptr, 16); + if (endptr == target) + goto out_free_target; + + s = strchr(endptr, '<'); + if (s == NULL) + goto out_free_target; + endptr = strchr(s + 1, '>'); + if (endptr == NULL) + goto out_free_target; + + *endptr = '\0'; + ops->target.name = strdup(s + 1); + *endptr = '>'; + if (ops->target.name == NULL) + goto out_free_target; + + return 0; + +out_free_target: + zfree(&ops->target.raw); +out_free_source: + zfree(&ops->source.raw); + return -1; +} + + +static const struct ins_ops s390_mov_ops = { + .parse = s390_mov__parse, + .scnprintf = mov__scnprintf, +}; + +static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) +{ + const struct ins_ops *ops = NULL; + + /* catch all kind of jumps */ + if (strchr(name, 'j') || + !strncmp(name, "bct", 3) || + !strncmp(name, "br", 2)) + ops = &jump_ops; + /* override call/returns */ + if (!strcmp(name, "bras") || + !strcmp(name, "brasl") || + !strcmp(name, "basr")) + ops = &s390_call_ops; + if (!strcmp(name, "br")) + ops = &ret_ops; + /* override load/store relative to PC */ + if (!strcmp(name, "lrl") || + !strcmp(name, "lgrl") || + !strcmp(name, "lgfrl") || + !strcmp(name, "llgfrl") || + !strcmp(name, "strl") || + !strcmp(name, "stgrl")) + ops = &s390_mov_ops; + + if (ops) + arch__associate_ins_ops(arch, name, ops); + return ops; +} + +static int s390__cpuid_parse(struct arch *arch, char *cpuid) +{ + unsigned int family; + char model[16], model_c[16], cpumf_v[16], cpumf_a[16]; + int ret; + + /* + * cpuid string format: + * "IBM,family,model-capacity,model[,cpum_cf-version,cpum_cf-authorization]" + */ + ret = sscanf(cpuid, "%*[^,],%u,%[^,],%[^,],%[^,],%s", &family, model_c, + model, cpumf_v, cpumf_a); + if (ret >= 2) { + arch->family = family; + arch->model = 0; + return 0; + } + + return -1; +} + +int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + int err = 0; + + if (!arch->initialized) { + arch->initialized = true; + arch->associate_instruction_ops = s390__associate_ins_ops; + if (cpuid) { + if (s390__cpuid_parse(arch, cpuid)) + err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + } + arch->objdump.comment_char = '#'; + } + + return err; +} diff --git a/tools/perf/util/annotate-arch/annotate-sparc.c b/tools/perf/util/annotate-arch/annotate-sparc.c new file mode 100644 index 000000000000..66a0174376dd --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-sparc.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../../util/disasm.h" + +static int is_branch_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if (cond[0] == 'a' && cond[1] == '\0') + return 1; + + if (cond[0] == 'c' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'e' && + (cond[1] == '\0' || + (cond[1] == 'q' && cond[2] == '\0'))) + return 1; + + if (cond[0] == 'g' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'l' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'u' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'n' && + (cond[1] == '\0' || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'z' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'b' && + cond[1] == 'p' && + cond[2] == 'o' && + cond[3] == 's' && + cond[4] == '\0') + return 1; + + if (cond[0] == 'v' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'b' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_reg_cond(const char *cond) +{ + if ((cond[0] == 'n' || cond[0] == 'l') && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + if (cond[0] == 'z' && + cond[1] == '\0') + return 1; + + if ((cond[0] == 'g' || cond[0] == 'l') && + cond[1] == 'e' && + cond[2] == 'z' && + cond[3] == '\0') + return 1; + + if (cond[0] == 'g' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_float_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if ((cond[0] == 'a' || cond[0] == 'e' || + cond[0] == 'z' || cond[0] == 'g' || + cond[0] == 'l' || cond[0] == 'n' || + cond[0] == 'o' || cond[0] == 'u') && + cond[1] == '\0') + return 1; + + if (((cond[0] == 'g' && cond[1] == 'e') || + (cond[0] == 'l' && (cond[1] == 'e' || + cond[1] == 'g')) || + (cond[0] == 'n' && (cond[1] == 'e' || + cond[1] == 'z')) || + (cond[0] == 'u' && (cond[1] == 'e' || + cond[1] == 'g' || + cond[1] == 'l'))) && + cond[2] == '\0') + return 1; + + if (cond[0] == 'u' && + (cond[1] == 'g' || cond[1] == 'l') && + cond[2] == 'e' && + cond[3] == '\0') + return 1; + + return 0; +} + +static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) +{ + const struct ins_ops *ops = NULL; + + if (!strcmp(name, "call") || + !strcmp(name, "jmp") || + !strcmp(name, "jmpl")) { + ops = &call_ops; + } else if (!strcmp(name, "ret") || + !strcmp(name, "retl") || + !strcmp(name, "return")) { + ops = &ret_ops; + } else if (!strcmp(name, "mov")) { + ops = &mov_ops; + } else { + if (name[0] == 'c' && + (name[1] == 'w' || name[1] == 'x')) + name += 2; + + if (name[0] == 'b') { + const char *cond = name + 1; + + if (cond[0] == 'r') { + if (is_branch_reg_cond(cond + 1)) + ops = &jump_ops; + } else if (is_branch_cond(cond)) { + ops = &jump_ops; + } + } else if (name[0] == 'f' && name[1] == 'b') { + if (is_branch_float_cond(name + 2)) + ops = &jump_ops; + } + } + + if (ops) + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->initialized = true; + arch->associate_instruction_ops = sparc__associate_instruction_ops; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c new file mode 100644 index 000000000000..0c7957fe60da --- /dev/null +++ b/tools/perf/util/annotate-arch/annotate-x86.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "../annotate-data.h" +#include "../debug.h" +#include "../disasm.h" +#include "../dso.h" +#include "../map.h" +#include "../string2.h" // strstarts +#include "../symbol.h" + +/* + * x86 instruction nmemonic table to parse disasm lines for annotate. + * This table is searched twice - one for exact match and another for + * match without a size suffix (b, w, l, q) in case of AT&T syntax. + * + * So this table should not have entries with the suffix unless it's + * a complete different instruction than ones without the suffix. + */ +static const struct ins x86__instructions[] = { + { .name = "adc", .ops = &mov_ops, }, + { .name = "add", .ops = &mov_ops, }, + { .name = "addsd", .ops = &mov_ops, }, + { .name = "and", .ops = &mov_ops, }, + { .name = "andpd", .ops = &mov_ops, }, + { .name = "andps", .ops = &mov_ops, }, + { .name = "bsr", .ops = &mov_ops, }, + { .name = "bt", .ops = &mov_ops, }, + { .name = "btr", .ops = &mov_ops, }, + { .name = "bts", .ops = &mov_ops, }, + { .name = "call", .ops = &call_ops, }, + { .name = "cmovae", .ops = &mov_ops, }, + { .name = "cmovbe", .ops = &mov_ops, }, + { .name = "cmove", .ops = &mov_ops, }, + { .name = "cmp", .ops = &mov_ops, }, + { .name = "cmpxch", .ops = &mov_ops, }, + { .name = "cmpxchg", .ops = &mov_ops, }, + { .name = "cs", .ops = &mov_ops, }, + { .name = "dec", .ops = &dec_ops, }, + { .name = "divsd", .ops = &mov_ops, }, + { .name = "divss", .ops = &mov_ops, }, + { .name = "gs", .ops = &mov_ops, }, + { .name = "imul", .ops = &mov_ops, }, + { .name = "inc", .ops = &dec_ops, }, + { .name = "ja", .ops = &jump_ops, }, + { .name = "jae", .ops = &jump_ops, }, + { .name = "jb", .ops = &jump_ops, }, + { .name = "jbe", .ops = &jump_ops, }, + { .name = "jc", .ops = &jump_ops, }, + { .name = "jcxz", .ops = &jump_ops, }, + { .name = "je", .ops = &jump_ops, }, + { .name = "jecxz", .ops = &jump_ops, }, + { .name = "jg", .ops = &jump_ops, }, + { .name = "jge", .ops = &jump_ops, }, + { .name = "jl", .ops = &jump_ops, }, + { .name = "jle", .ops = &jump_ops, }, + { .name = "jmp", .ops = &jump_ops, }, + { .name = "jna", .ops = &jump_ops, }, + { .name = "jnae", .ops = &jump_ops, }, + { .name = "jnb", .ops = &jump_ops, }, + { .name = "jnbe", .ops = &jump_ops, }, + { .name = "jnc", .ops = &jump_ops, }, + { .name = "jne", .ops = &jump_ops, }, + { .name = "jng", .ops = &jump_ops, }, + { .name = "jnge", .ops = &jump_ops, }, + { .name = "jnl", .ops = &jump_ops, }, + { .name = "jnle", .ops = &jump_ops, }, + { .name = "jno", .ops = &jump_ops, }, + { .name = "jnp", .ops = &jump_ops, }, + { .name = "jns", .ops = &jump_ops, }, + { .name = "jnz", .ops = &jump_ops, }, + { .name = "jo", .ops = &jump_ops, }, + { .name = "jp", .ops = &jump_ops, }, + { .name = "jpe", .ops = &jump_ops, }, + { .name = "jpo", .ops = &jump_ops, }, + { .name = "jrcxz", .ops = &jump_ops, }, + { .name = "js", .ops = &jump_ops, }, + { .name = "jz", .ops = &jump_ops, }, + { .name = "lea", .ops = &mov_ops, }, + { .name = "lock", .ops = &lock_ops, }, + { .name = "mov", .ops = &mov_ops, }, + { .name = "movapd", .ops = &mov_ops, }, + { .name = "movaps", .ops = &mov_ops, }, + { .name = "movdqa", .ops = &mov_ops, }, + { .name = "movdqu", .ops = &mov_ops, }, + { .name = "movsb", .ops = &mov_ops, }, + { .name = "movsd", .ops = &mov_ops, }, + { .name = "movsl", .ops = &mov_ops, }, + { .name = "movss", .ops = &mov_ops, }, + { .name = "movsw", .ops = &mov_ops, }, + { .name = "movupd", .ops = &mov_ops, }, + { .name = "movups", .ops = &mov_ops, }, + { .name = "movzb", .ops = &mov_ops, }, + { .name = "movzl", .ops = &mov_ops, }, + { .name = "movzw", .ops = &mov_ops, }, + { .name = "mulsd", .ops = &mov_ops, }, + { .name = "mulss", .ops = &mov_ops, }, + { .name = "nop", .ops = &nop_ops, }, + { .name = "or", .ops = &mov_ops, }, + { .name = "orps", .ops = &mov_ops, }, + { .name = "paddq", .ops = &mov_ops, }, + { .name = "pand", .ops = &mov_ops, }, + { .name = "pcmpeqb", .ops = &mov_ops, }, + { .name = "por", .ops = &mov_ops, }, + { .name = "rcl", .ops = &mov_ops, }, + { .name = "ret", .ops = &ret_ops, }, + { .name = "sbb", .ops = &mov_ops, }, + { .name = "sete", .ops = &mov_ops, }, + { .name = "sub", .ops = &mov_ops, }, + { .name = "subsd", .ops = &mov_ops, }, + { .name = "test", .ops = &mov_ops, }, + { .name = "tzcnt", .ops = &mov_ops, }, + { .name = "ucomisd", .ops = &mov_ops, }, + { .name = "ucomiss", .ops = &mov_ops, }, + { .name = "vaddsd", .ops = &mov_ops, }, + { .name = "vandpd", .ops = &mov_ops, }, + { .name = "vmovdqa", .ops = &mov_ops, }, + { .name = "vmovq", .ops = &mov_ops, }, + { .name = "vmovsd", .ops = &mov_ops, }, + { .name = "vmulsd", .ops = &mov_ops, }, + { .name = "vorpd", .ops = &mov_ops, }, + { .name = "vsubsd", .ops = &mov_ops, }, + { .name = "vucomisd", .ops = &mov_ops, }, + { .name = "xadd", .ops = &mov_ops, }, + { .name = "xbegin", .ops = &jump_ops, }, + { .name = "xchg", .ops = &mov_ops, }, + { .name = "xor", .ops = &mov_ops, }, + { .name = "xorpd", .ops = &mov_ops, }, + { .name = "xorps", .ops = &mov_ops, }, +}; + +static bool amd__ins_is_fused(const struct arch *arch, const char *ins1, + const char *ins2) +{ + if (strstr(ins2, "jmp")) + return false; + + /* Family >= 15h supports cmp/test + branch fusion */ + if (arch->family >= 0x15 && (strstarts(ins1, "test") || + (strstarts(ins1, "cmp") && !strstr(ins1, "xchg")))) { + return true; + } + + /* Family >= 19h supports some ALU + branch fusion */ + if (arch->family >= 0x19 && (strstarts(ins1, "add") || + strstarts(ins1, "sub") || strstarts(ins1, "and") || + strstarts(ins1, "inc") || strstarts(ins1, "dec") || + strstarts(ins1, "or") || strstarts(ins1, "xor"))) { + return true; + } + + return false; +} + +static bool intel__ins_is_fused(const struct arch *arch, const char *ins1, + const char *ins2) +{ + if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp")) + return false; + + if (arch->model == 0x1e) { + /* Nehalem */ + if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) || + strstr(ins1, "test")) { + return true; + } + } else { + /* Newer platform */ + if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) || + strstr(ins1, "test") || + strstr(ins1, "add") || + strstr(ins1, "sub") || + strstr(ins1, "and") || + strstr(ins1, "inc") || + strstr(ins1, "dec")) { + return true; + } + } + + return false; +} + +static int x86__cpuid_parse(struct arch *arch, char *cpuid) +{ + unsigned int family, model, stepping; + int ret; + + /* + * cpuid = "GenuineIntel,family,model,stepping" + */ + ret = sscanf(cpuid, "%*[^,],%u,%u,%u", &family, &model, &stepping); + if (ret == 3) { + arch->family = family; + arch->model = model; + arch->ins_is_fused = strstarts(cpuid, "AuthenticAMD") ? + amd__ins_is_fused : + intel__ins_is_fused; + return 0; + } + + return -1; +} + +#ifdef HAVE_LIBDW_SUPPORT +static void update_insn_state_x86(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + Dwarf_Die type_die; + u32 insn_offset = dl->al.offset; + int fbreg = dloc->fbreg; + int fboff = 0; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + if (ins__is_call(&dl->ins)) { + struct symbol *func = dl->ops.target.sym; + + if (func == NULL) + return; + + /* __fentry__ will preserve all registers */ + if (!strcmp(func->name, "__fentry__")) + return; + + pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); + + /* Otherwise invalidate caller-saved registers after call */ + for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { + if (state->regs[i].caller_saved) + state->regs[i].ok = false; + } + + /* Update register with the return type (if any) */ + if (die_find_func_rettype(cu_die, func->name, &type_die)) { + tsr = &state->regs[state->ret_reg]; + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("call [%x] return -> reg%d", + insn_offset, state->ret_reg); + pr_debug_type_name(&type_die, tsr->kind); + } + return; + } + + if (!strncmp(dl->ins.name, "add", 3)) { + u64 imm_value = -1ULL; + int offset; + const char *var_name = NULL; + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + else if (src->reg1 == DWARF_REG_PC) { + u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, + src->offset, dl); + + if (get_global_var_info(dloc, var_addr, + &var_name, &offset) && + !strcmp(var_name, "this_cpu_off") && + tsr->kind == TSR_KIND_CONST) { + tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->offset = 0; + tsr->ok = true; + imm_value = tsr->imm_value; + } + } + else + return; + + /* Ignore add to non-pointer or non-const types */ + if (tsr->kind == TSR_KIND_POINTER || + (dwarf_tag(&tsr->type) == DW_TAG_pointer_type && + src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) { + tsr->offset += imm_value; + pr_debug_dtp("add [%x] offset %#"PRIx64" to reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + + if (tsr->kind == TSR_KIND_CONST) + tsr->imm_value += imm_value; + + if (tsr->kind != TSR_KIND_PERCPU_BASE) + return; + + if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, + &type_die) && offset == 0) { + /* + * This is not a pointer type, but it should be treated + * as a pointer. + */ + tsr->type = type_die; + tsr->kind = TSR_KIND_PERCPU_POINTER; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + + if (!strncmp(dl->ins.name, "sub", 3)) { + u64 imm_value = -1ULL; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + + if (tsr->kind == TSR_KIND_POINTER || + (dwarf_tag(&tsr->type) == DW_TAG_pointer_type && + src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) { + tsr->offset -= imm_value; + pr_debug_dtp("sub [%x] offset %#"PRIx64" to reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + + if (tsr->kind == TSR_KIND_CONST) + tsr->imm_value -= imm_value; + + return; + } + + if (!strncmp(dl->ins.name, "lea", 3)) { + int sreg = src->reg1; + struct type_state_reg src_tsr; + + if (!has_reg_type(state, sreg) || + !has_reg_type(state, dst->reg1) || + !src->mem_ref) + return; + + src_tsr = state->regs[sreg]; + tsr = &state->regs[dst->reg1]; + + tsr->copied_from = -1; + tsr->ok = false; + + /* Case 1: Based on stack pointer or frame pointer */ + if (sreg == fbreg || sreg == state->stack_reg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (!stack) + return; + + tsr->type = stack->type; + tsr->kind = TSR_KIND_POINTER; + tsr->offset = offset - stack->offset; + tsr->ok = true; + + if (sreg == fbreg) { + pr_debug_dtp("lea [%x] address of -%#x(stack) -> reg%d", + insn_offset, -src->offset, dst->reg1); + } else { + pr_debug_dtp("lea [%x] address of %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } + + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Case 2: Based on a register holding a typed pointer */ + else if (src_tsr.ok && (src_tsr.kind == TSR_KIND_POINTER || + (dwarf_tag(&src_tsr.type) == DW_TAG_pointer_type && + src_tsr.kind == TSR_KIND_TYPE))) { + + if (src_tsr.kind == TSR_KIND_TYPE && + __die_get_real_type(&state->regs[sreg].type, &type_die) == NULL) + return; + + if (src_tsr.kind == TSR_KIND_POINTER) + type_die = state->regs[sreg].type; + + /* Check if the target type has a member at the new offset */ + if (die_get_member_type(&type_die, + src->offset + src_tsr.offset, &type_die) == NULL) + return; + + tsr->type = src_tsr.type; + tsr->kind = src_tsr.kind; + tsr->offset = src->offset + src_tsr.offset; + tsr->ok = true; + + pr_debug_dtp("lea [%x] address of %s%#x(reg%d) -> reg%d", + insn_offset, src->offset < 0 ? "-" : "", + abs(src->offset), sreg, dst->reg1); + + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + + /* Invalidate register states for other ops which may change pointers */ + if (has_reg_type(state, dst->reg1) && !dst->mem_ref && + dwarf_tag(&state->regs[dst->reg1].type) == DW_TAG_pointer_type) { + if (!strncmp(dl->ins.name, "imul", 4) || !strncmp(dl->ins.name, "mul", 3) || + !strncmp(dl->ins.name, "idiv", 4) || !strncmp(dl->ins.name, "div", 3) || + !strncmp(dl->ins.name, "shl", 3) || !strncmp(dl->ins.name, "shr", 3) || + !strncmp(dl->ins.name, "sar", 3) || !strncmp(dl->ins.name, "and", 3) || + !strncmp(dl->ins.name, "or", 2) || !strncmp(dl->ins.name, "neg", 3) || + !strncmp(dl->ins.name, "inc", 3) || !strncmp(dl->ins.name, "dec", 3)) { + pr_debug_dtp("%s [%x] invalidate reg%d\n", + dl->ins.name, insn_offset, dst->reg1); + state->regs[dst->reg1].ok = false; + state->regs[dst->reg1].copied_from = -1; + return; + } + + if (!strncmp(dl->ins.name, "xor", 3) && dst->reg1 == src->reg1) { + /* xor reg, reg clears the register */ + pr_debug_dtp("xor [%x] clear reg%d\n", + insn_offset, dst->reg1); + + state->regs[dst->reg1].kind = TSR_KIND_CONST; + state->regs[dst->reg1].imm_value = 0; + state->regs[dst->reg1].ok = true; + state->regs[dst->reg1].copied_from = -1; + return; + } + } + + if (strncmp(dl->ins.name, "mov", 3)) + return; + + if (dloc->fb_cfa) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 pc = map__rip_2objdump(dloc->ms->map, ip); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + } + + /* Case 1. register to register or segment:offset to register transfers */ + if (!src->mem_ref && !dst->mem_ref) { + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + + if (dso__kernel(map__dso(dloc->ms->map)) && + src->segment == INSN_SEG_X86_GS && src->imm) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + var_addr = src->offset; + + if (var_addr == 40) { + tsr->kind = TSR_KIND_CANARY; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] stack canary -> reg%d\n", + insn_offset, dst->reg1); + return; + } + + if (!get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", + insn_offset, var_addr, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + return; + } + + if (src->imm) { + tsr->kind = TSR_KIND_CONST; + tsr->imm_value = src->offset; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", + insn_offset, tsr->imm_value, dst->reg1); + return; + } + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->imm_value = state->regs[src->reg1].imm_value; + tsr->offset = state->regs[src->reg1].offset; + tsr->ok = true; + + /* To copy back the variable type later (hopefully) */ + if (tsr->kind == TSR_KIND_TYPE || tsr->kind == TSR_KIND_POINTER) + tsr->copied_from = src->reg1; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Case 2. memory to register transers */ + if (src->mem_ref && !dst->mem_ref) { + int sreg = src->reg1; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + +retry: + /* Check stack variables with offset */ + if (sreg == fbreg || sreg == state->stack_reg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack == NULL) { + tsr->ok = false; + return; + } else if (!stack->compound) { + tsr->type = stack->type; + tsr->kind = stack->kind; + tsr->offset = stack->ptr_offset; + tsr->ok = true; + } else if (die_get_member_type(&stack->type, + offset - stack->offset, + &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + } else { + tsr->ok = false; + return; + } + + if (sreg == fbreg) { + pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", + insn_offset, -offset, dst->reg1); + } else { + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, offset, sreg, dst->reg1); + } + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* And then dereference the pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_TYPE && + die_deref_ptr_type(&state->regs[sreg].type, + src->offset + state->regs[sreg].offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Handle dereference of TSR_KIND_POINTER registers */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset + state->regs[sreg].offset, &type_die)) { + tsr->type = state->regs[sreg].type; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = src->offset + state->regs[sreg].offset; + tsr->ok = true; + + pr_debug_dtp("mov [%x] addr %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or check if it's a global variable */ + else if (sreg == DWARF_REG_PC) { + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + u64 addr; + int offset; + + addr = annotate_calc_pcrel(ms, ip, src->offset, dl); + + if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, + &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", + insn_offset, addr, dst->reg1); + pr_debug_type_name(&type_die, tsr->kind); + } + /* And check percpu access with base register */ + else if (has_reg_type(state, sreg) && + state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr = src->offset; + int offset; + + if (src->multi_regs) { + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + if (get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) && + die_get_member_type(&type_die, offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + if (src->multi_regs) { + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", + insn_offset, src->offset, src->reg1, + src->reg2, dst->reg1); + } else { + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } + pr_debug_type_name(&tsr->type, tsr->kind); + } else { + tsr->ok = false; + } + } + /* And then dereference the calculated pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_PERCPU_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or try another register if any */ + else if (src->multi_regs && sreg == src->reg1 && + src->reg1 != src->reg2) { + sreg = src->reg2; + goto retry; + } + else { + int offset; + const char *var_name = NULL; + + /* it might be per-cpu variable (in kernel) access */ + if (src->offset < 0) { + if (get_global_var_info(dloc, (s64)src->offset, + &var_name, &offset) && + !strcmp(var_name, "__per_cpu_offset")) { + tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->offset = 0; + tsr->ok = true; + + pr_debug_dtp("mov [%x] percpu base reg%d\n", + insn_offset, dst->reg1); + return; + } + } + + tsr->ok = false; + } + } + /* Case 3. register to memory transfers */ + if (!src->mem_ref && dst->mem_ref) { + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) + return; + + /* Check stack variables with offset */ + if (dst->reg1 == fbreg || dst->reg1 == state->stack_reg) { + struct type_state_stack *stack; + int offset = dst->offset - fboff; + + tsr = &state->regs[src->reg1]; + + stack = find_stack_state(state, offset); + if (stack) { + /* + * The source register is likely to hold a type + * of member if it's a compound type. Do not + * update the stack variable type since we can + * get the member type later by using the + * die_get_member_type(). + */ + if (!stack->compound) + set_stack_state(stack, offset, tsr->kind, + &tsr->type, tsr->offset); + } else { + findnew_stack_state(state, offset, tsr->kind, + &tsr->type, tsr->offset); + } + + if (dst->reg1 == fbreg) { + pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", + insn_offset, src->reg1, -offset); + } else { + pr_debug_dtp("mov [%x] reg%d -> %#x(reg%d)", + insn_offset, src->reg1, offset, dst->reg1); + } + if (tsr->offset != 0) { + pr_debug_dtp(" reg%d offset %#x ->", + src->reg1, tsr->offset); + } + + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* + * Ignore other transfers since it'd set a value in a struct + * and won't change the type. + */ + } + /* Case 4. memory to memory transfers (not handled for now) */ +} +#endif + +int x86__annotate_init(struct arch *arch, char *cpuid) +{ + int err = 0; + + if (arch->initialized) + return 0; + + if (cpuid) { + if (x86__cpuid_parse(arch, cpuid)) + err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + } + + arch->instructions = x86__instructions; + arch->nr_instructions = ARRAY_SIZE(x86__instructions); +#ifndef NDEBUG + { + static bool sorted_check; + + if (!sorted_check) { + for (size_t i = 0; i < arch->nr_instructions - 1; i++) { + assert(strcmp(arch->instructions[i].name, + arch->instructions[i + 1].name) <= 0); + } + sorted_check = true; + } + } +#endif + arch->sorted_instructions = true; + arch->objdump.comment_char = '#'; + arch->objdump.register_char = '%'; + arch->objdump.memory_ref_char = '('; + arch->objdump.imm_char = '$'; + arch->insn_suffix = "bwlq"; + arch->e_machine = EM_X86_64; + arch->e_flags = 0; + arch->initialized = true; +#ifdef HAVE_LIBDW_SUPPORT + arch->update_insn_state = update_insn_state_x86; +#endif + return err; +} diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index b7523256c4ad..845c2d0f39b1 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -33,20 +33,15 @@ static regex_t file_lineno; /* These can be referred from the arch-dependent code */ -static const struct ins_ops call_ops; -static const struct ins_ops dec_ops; -static const struct ins_ops jump_ops; -static const struct ins_ops mov_ops; -static const struct ins_ops nop_ops; -static const struct ins_ops lock_ops; -static const struct ins_ops ret_ops; -static const struct ins_ops load_store_ops; -static const struct ins_ops arithmetic_ops; - -static int jump__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); -static int call__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); +const struct ins_ops call_ops; +const struct ins_ops dec_ops; +const struct ins_ops jump_ops; +const struct ins_ops mov_ops; +const struct ins_ops nop_ops; +const struct ins_ops lock_ops; +const struct ins_ops ret_ops; +const struct ins_ops load_store_ops; +const struct ins_ops arithmetic_ops; static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); @@ -86,7 +81,7 @@ grow_from_non_allocated_table: goto out_update_instructions; } -static int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops) +int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops) { struct ins *ins; @@ -106,90 +101,66 @@ static int arch__associate_ins_ops(struct arch *arch, const char *name, const st return 0; } -#include "arch/arc/annotate/instructions.c" -#include "arch/arm/annotate/instructions.c" -#include "arch/arm64/annotate/instructions.c" -#include "arch/csky/annotate/instructions.c" -#include "arch/loongarch/annotate/instructions.c" -#include "arch/mips/annotate/instructions.c" -#include "arch/x86/annotate/instructions.c" -#include "arch/powerpc/annotate/instructions.c" -#include "arch/riscv64/annotate/instructions.c" -#include "arch/s390/annotate/instructions.c" -#include "arch/sparc/annotate/instructions.c" - static struct arch architectures[] = { { .name = "arc", .init = arc__annotate_init, + .e_machine = EM_ARC, }, { .name = "arm", .init = arm__annotate_init, + .e_machine = EM_ARM, }, { .name = "arm64", .init = arm64__annotate_init, + .e_machine = EM_AARCH64, }, { .name = "csky", .init = csky__annotate_init, + .e_machine = EM_CSKY, +#if defined(__CSKYABIV2__) + .e_flags = EF_CSKY_ABIV2, +#else + .e_flags = EF_CSKY_ABIV1, +#endif }, { .name = "mips", .init = mips__annotate_init, - .objdump = { - .comment_char = '#', - }, + .e_machine = EM_MIPS, }, { .name = "x86", .init = x86__annotate_init, - .instructions = x86__instructions, - .nr_instructions = ARRAY_SIZE(x86__instructions), - .sorted_instructions = true, - .insn_suffix = "bwlq", - .objdump = { - .comment_char = '#', - .register_char = '%', - .memory_ref_char = '(', - .imm_char = '$', - }, -#ifdef HAVE_LIBDW_SUPPORT - .update_insn_state = update_insn_state_x86, -#endif + .e_machine = EM_X86_64, // TODO: EM_386 too. }, { .name = "powerpc", .init = powerpc__annotate_init, -#ifdef HAVE_LIBDW_SUPPORT - .update_insn_state = update_insn_state_powerpc, -#endif + .e_machine = EM_PPC, // TODO: EM_PPC64 too. }, { .name = "riscv64", .init = riscv64__annotate_init, + .e_machine = EM_RISCV, }, { .name = "s390", .init = s390__annotate_init, - .objdump = { - .comment_char = '#', - }, + .e_machine = EM_S390, }, { .name = "sparc", .init = sparc__annotate_init, - .objdump = { - .comment_char = '#', - }, + .e_machine = EM_SPARC, }, { .name = "loongarch", .init = loongarch__annotate_init, - .objdump = { - .comment_char = '#', - }, + .e_machine = EM_LOONGARCH, }, }; @@ -248,14 +219,14 @@ static void ins_ops__delete(struct ins_operands *ops) zfree(&ops->target.name); } -static int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); } -static int ins__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +int ins__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { if (ins->ops->scnprintf) return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); @@ -326,8 +297,8 @@ indirect_call: goto find_target; } -static int call__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +int call__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { if (ops->target.sym) return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); @@ -341,7 +312,7 @@ static int call__scnprintf(const struct ins *ins, char *bf, size_t size, return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); } -static const struct ins_ops call_ops = { +const struct ins_ops call_ops = { .parse = call__parse, .scnprintf = call__scnprintf, }; @@ -453,8 +424,8 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct return 0; } -static int jump__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +int jump__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { const char *c; @@ -494,7 +465,7 @@ static void jump__delete(struct ins_operands *ops __maybe_unused) */ } -static const struct ins_ops jump_ops = { +const struct ins_ops jump_ops = { .free = jump__delete, .parse = jump__parse, .scnprintf = jump__scnprintf, @@ -586,7 +557,7 @@ static void lock__delete(struct ins_operands *ops) zfree(&ops->target.name); } -static const struct ins_ops lock_ops = { +const struct ins_ops lock_ops = { .free = lock__delete, .parse = lock__parse, .scnprintf = lock__scnprintf, @@ -687,103 +658,19 @@ out_free_source: return -1; } -static int mov__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +int mov__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, ops->source.name ?: ops->source.raw, ops->target.name ?: ops->target.raw); } -static const struct ins_ops mov_ops = { +const struct ins_ops mov_ops = { .parse = mov__parse, .scnprintf = mov__scnprintf, }; -#define PPC_22_30(R) (((R) >> 1) & 0x1ff) -#define MINUS_EXT_XO_FORM 234 -#define SUB_EXT_XO_FORM 232 -#define ADD_ZERO_EXT_XO_FORM 202 -#define SUB_ZERO_EXT_XO_FORM 200 - -static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, - ops->raw); -} - -/* - * Sets the fields: multi_regs and "mem_ref". - * "mem_ref" is set for ops->source which is later used to - * fill the objdump->memory_ref-char field. This ops is currently - * used by powerpc and since binary instruction code is used to - * extract opcode, regs and offset, no other parsing is needed here. - * - * Dont set multi regs for 4 cases since it has only one operand - * for source: - * - Add to Minus One Extended XO-form ( Ex: addme, addmeo ) - * - Subtract From Minus One Extended XO-form ( Ex: subfme ) - * - Add to Zero Extended XO-form ( Ex: addze, addzeo ) - * - Subtract From Zero Extended XO-form ( Ex: subfze ) - */ -static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused, struct disasm_line *dl) -{ - int opcode = PPC_OP(dl->raw.raw_insn); - - ops->source.mem_ref = false; - if (opcode == 31) { - if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) \ - && (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM)) - ops->source.multi_regs = true; - } - - ops->target.mem_ref = false; - ops->target.multi_regs = false; - - return 0; -} - -static const struct ins_ops arithmetic_ops = { - .parse = arithmetic__parse, - .scnprintf = arithmetic__scnprintf, -}; - -static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, - ops->raw); -} - -/* - * Sets the fields: multi_regs and "mem_ref". - * "mem_ref" is set for ops->source which is later used to - * fill the objdump->memory_ref-char field. This ops is currently - * used by powerpc and since binary instruction code is used to - * extract opcode, regs and offset, no other parsing is needed here - */ -static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) -{ - ops->source.mem_ref = true; - ops->source.multi_regs = false; - /* opcode 31 is of X form */ - if (PPC_OP(dl->raw.raw_insn) == 31) - ops->source.multi_regs = true; - - ops->target.mem_ref = false; - ops->target.multi_regs = false; - - return 0; -} - -static const struct ins_ops load_store_ops = { - .parse = load_store__parse, - .scnprintf = load_store__scnprintf, -}; - static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) @@ -820,7 +707,7 @@ static int dec__scnprintf(const struct ins *ins, char *bf, size_t size, ops->target.name ?: ops->target.raw); } -static const struct ins_ops dec_ops = { +const struct ins_ops dec_ops = { .parse = dec__parse, .scnprintf = dec__scnprintf, }; @@ -831,11 +718,11 @@ static int nop__scnprintf(const struct ins *ins __maybe_unused, char *bf, size_t return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); } -static const struct ins_ops nop_ops = { +const struct ins_ops nop_ops = { .scnprintf = nop__scnprintf, }; -static const struct ins_ops ret_ops = { +const struct ins_ops ret_ops = { .scnprintf = ins__raw_scnprintf, }; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index db7f1ee3d8e7..83503c5075f9 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -109,6 +109,28 @@ const struct arch *arch__find(const char *name); bool arch__is_x86(const struct arch *arch); bool arch__is_powerpc(const struct arch *arch); +extern const struct ins_ops call_ops; +extern const struct ins_ops dec_ops; +extern const struct ins_ops jump_ops; +extern const struct ins_ops mov_ops; +extern const struct ins_ops nop_ops; +extern const struct ins_ops lock_ops; +extern const struct ins_ops ret_ops; + +int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops); + +int arc__annotate_init(struct arch *arch, char *cpuid); +int arm__annotate_init(struct arch *arch, char *cpuid); +int arm64__annotate_init(struct arch *arch, char *cpuid); +int csky__annotate_init(struct arch *arch, char *cpuid); +int loongarch__annotate_init(struct arch *arch, char *cpuid); +int mips__annotate_init(struct arch *arch, char *cpuid); +int powerpc__annotate_init(struct arch *arch, char *cpuid); +int riscv64__annotate_init(struct arch *arch, char *cpuid); +int s390__annotate_init(struct arch *arch, char *cpuid); +int sparc__annotate_init(struct arch *arch, char *cpuid); +int x86__annotate_init(struct arch *arch, char *cpuid); + const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); bool ins__is_call(const struct ins *ins); @@ -117,12 +139,28 @@ bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2); bool ins__is_ret(const struct ins *ins); bool ins__is_lock(const struct ins *ins); +extern const struct ins_ops s390_call_ops; +extern const struct ins_ops loongarch_call_ops; +extern const struct ins_ops loongarch_jump_ops; +const struct ins_ops *check_ppc_insn(struct disasm_line *dl); + struct disasm_line *disasm_line__new(struct annotate_args *args); void disasm_line__free(struct disasm_line *dl); int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name); +int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +int ins__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +int call__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +int jump__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +int mov__scnprintf(const struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); + int symbol__disassemble(struct symbol *sym, struct annotate_args *args); char *expand_tabs(char *line, char **storage, size_t *storage_len); -- cgit v1.2.3 From 5301cc698821551b34d0b357cf7842984182b35c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:13 -0800 Subject: perf disasm: Refactor ins__is_call/jump to avoid exposing arch ins_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add booleans indicating whether and ins_ops are call or jump and return it. This avoids exposing loongarch and s390 ins_ops for the sake of matching. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-arch/annotate-loongarch.c | 6 ++++-- tools/perf/util/annotate-arch/annotate-s390.c | 3 ++- tools/perf/util/disasm.c | 6 ++++-- tools/perf/util/disasm.h | 5 ++--- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c index 32df10f6fed5..79dc116ade2f 100644 --- a/tools/perf/util/annotate-arch/annotate-loongarch.c +++ b/tools/perf/util/annotate-arch/annotate-loongarch.c @@ -57,9 +57,10 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o return 0; } -const struct ins_ops loongarch_call_ops = { +static const struct ins_ops loongarch_call_ops = { .parse = loongarch_call__parse, .scnprintf = call__scnprintf, + .is_call = true, }; static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops, @@ -106,9 +107,10 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o return 0; } -const struct ins_ops loongarch_jump_ops = { +static const struct ins_ops loongarch_jump_ops = { .parse = loongarch_jump__parse, .scnprintf = jump__scnprintf, + .is_jump = true, }; static diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c index 81db102b3e15..7b2d27b62e6b 100644 --- a/tools/perf/util/annotate-arch/annotate-s390.c +++ b/tools/perf/util/annotate-arch/annotate-s390.c @@ -57,9 +57,10 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, return 0; } -const struct ins_ops s390_call_ops = { +static const struct ins_ops s390_call_ops = { .parse = s390_call__parse, .scnprintf = call__scnprintf, + .is_call = true, }; static int s390_mov__parse(const struct arch *arch __maybe_unused, diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 845c2d0f39b1..8c3e9094600a 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -315,11 +315,12 @@ int call__scnprintf(const struct ins *ins, char *bf, size_t size, const struct ins_ops call_ops = { .parse = call__parse, .scnprintf = call__scnprintf, + .is_call = true, }; bool ins__is_call(const struct ins *ins) { - return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; + return ins->ops && ins->ops->is_call; } /* @@ -469,11 +470,12 @@ const struct ins_ops jump_ops = { .free = jump__delete, .parse = jump__parse, .scnprintf = jump__scnprintf, + .is_jump = true, }; bool ins__is_jump(const struct ins *ins) { - return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; + return ins->ops && ins->ops->is_jump; } static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 83503c5075f9..b6a2a30fdf27 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -93,6 +93,8 @@ struct ins_ops { struct disasm_line *dl); int (*scnprintf)(const struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); + bool is_jump; + bool is_call; }; struct annotate_args { @@ -139,9 +141,6 @@ bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2); bool ins__is_ret(const struct ins *ins); bool ins__is_lock(const struct ins *ins); -extern const struct ins_ops s390_call_ops; -extern const struct ins_ops loongarch_call_ops; -extern const struct ins_ops loongarch_jump_ops; const struct ins_ops *check_ppc_insn(struct disasm_line *dl); struct disasm_line *disasm_line__new(struct annotate_args *args); -- cgit v1.2.3 From c4e3a00356fffb20c03bd9609083afb1dc4a2edf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:14 -0800 Subject: perf map_symbol: Switch from holding maps to holding thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit maps may belong to >1 thread. In contexts like symbolization information from the thread may be useful, such as the ELF machine. As the maps can be gained from the thread switch from holding maps in struct map_symbol to holding the thread. Holding the maps in addr_location is also redundant, switch this to using thread__maps. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/ui/browsers/hists.c | 3 ++- tools/perf/util/addr_location.c | 4 ---- tools/perf/util/addr_location.h | 1 - tools/perf/util/annotate-arch/annotate-loongarch.c | 5 +++-- tools/perf/util/annotate-arch/annotate-s390.c | 3 ++- tools/perf/util/callchain.c | 12 +++++++----- tools/perf/util/capstone.c | 6 ++++-- tools/perf/util/db-export.c | 1 - tools/perf/util/disasm.c | 7 ++++--- tools/perf/util/event.c | 2 -- tools/perf/util/hist.c | 18 +++++++++--------- tools/perf/util/machine.c | 13 +++++++------ tools/perf/util/map_symbol.c | 5 +++-- tools/perf/util/map_symbol.h | 3 ++- tools/perf/util/sort.c | 2 +- tools/perf/util/unwind-libdw.c | 2 +- tools/perf/util/unwind-libunwind-local.c | 2 +- 18 files changed, 47 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 3df61cd46652..91ded9c271ee 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -601,7 +601,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser, return true; } - target_ms.maps = ms->maps; + target_ms.thread = ms->thread; target_ms.map = ms->map; target_ms.sym = dl->ops.target.sym; annotation__unlock(notes); diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 08fecbe28a52..cfa6386e6e1d 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3189,7 +3189,8 @@ do_hotkey: // key came straight from options ui__popup_menu() case 'k': if (browser->selection != NULL) hists_browser__zoom_map(browser, - maps__machine(browser->selection->maps)->vmlinux_map); + maps__machine(thread__maps(browser->selection->thread) + )->vmlinux_map); continue; case 'V': verbose = (verbose + 1) % 4; diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c index 81a0b79c5e10..57e8217a00f9 100644 --- a/tools/perf/util/addr_location.c +++ b/tools/perf/util/addr_location.c @@ -7,7 +7,6 @@ void addr_location__init(struct addr_location *al) { al->thread = NULL; - al->maps = NULL; al->map = NULL; al->sym = NULL; al->srcline = NULL; @@ -30,16 +29,13 @@ void addr_location__exit(struct addr_location *al) { map__zput(al->map); thread__zput(al->thread); - maps__zput(al->maps); } void addr_location__copy(struct addr_location *dst, struct addr_location *src) { thread__put(dst->thread); - maps__put(dst->maps); map__put(dst->map); *dst = *src; dst->thread = thread__get(src->thread); - dst->maps = maps__get(src->maps); dst->map = map__get(src->map); } diff --git a/tools/perf/util/addr_location.h b/tools/perf/util/addr_location.h index 64b551025216..fdc4d3f3a68b 100644 --- a/tools/perf/util/addr_location.h +++ b/tools/perf/util/addr_location.h @@ -11,7 +11,6 @@ struct symbol; struct addr_location { struct thread *thread; - struct maps *maps; struct map *map; struct symbol *sym; const char *srcline; diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c index 79dc116ade2f..6c94cb98a104 100644 --- a/tools/perf/util/annotate-arch/annotate-loongarch.c +++ b/tools/perf/util/annotate-arch/annotate-loongarch.c @@ -11,6 +11,7 @@ #include "../map.h" #include "../maps.h" #include "../symbol.h" +#include "../thread.h" static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, @@ -49,7 +50,7 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o .addr = map__objdump_2mem(map, ops->target.addr), }; - if (maps__find_ams(ms->maps, &target) == 0 && + if (maps__find_ams(thread__maps(ms->thread), &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; @@ -93,7 +94,7 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o ops->target.outside = target.addr < start || target.addr > end; - if (maps__find_ams(ms->maps, &target) == 0 && + if (maps__find_ams(thread__maps(ms->thread), &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c index 7b2d27b62e6b..47573f0310c1 100644 --- a/tools/perf/util/annotate-arch/annotate-s390.c +++ b/tools/perf/util/annotate-arch/annotate-s390.c @@ -6,6 +6,7 @@ #include "../map.h" #include "../maps.h" #include "../symbol.h" +#include "../thread.h" #include "../annotate.h" #include "../annotate-data.h" @@ -49,7 +50,7 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops, .addr = map__objdump_2mem(map, ops->target.addr), }; - if (maps__find_ams(ms->maps, &target) == 0 && + if (maps__find_ams(thread__maps(ms->thread), &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 428e5350d7a2..515bb8b5da01 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -31,6 +31,7 @@ #include "callchain.h" #include "branch.h" #include "symbol.h" +#include "thread.h" #include "util.h" #include "../perf.h" @@ -1042,7 +1043,7 @@ merge_chain_branch(struct callchain_cursor *cursor, list_for_each_entry_safe(list, next_list, &src->val, list) { struct map_symbol ms = { - .maps = maps__get(list->ms.maps), + .thread = thread__get(list->ms.thread), .map = map__get(list->ms.map), }; callchain_cursor_append(cursor, list->ip, &ms, false, NULL, 0, 0, 0, list->srcline); @@ -1147,10 +1148,11 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node, bool hide_unresolved) { - struct machine *machine = node->ms.maps ? maps__machine(node->ms.maps) : NULL; + struct machine *machine = NULL; + + if (node->ms.thread) + machine = maps__machine(thread__maps(node->ms.thread)); - maps__put(al->maps); - al->maps = maps__get(node->ms.maps); map__put(al->map); al->map = map__get(node->ms.map); al->sym = node->ms.sym; @@ -1163,7 +1165,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * if (al->map == NULL) goto out; } - if (maps__equal(al->maps, machine__kernel_maps(machine))) { + if (maps__equal(thread__maps(al->thread), machine__kernel_maps(machine))) { if (machine__is_host(machine)) { al->cpumode = PERF_RECORD_MISC_KERNEL; al->level = 'k'; diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c index ce06cfd253ef..9216916f848f 100644 --- a/tools/perf/util/capstone.c +++ b/tools/perf/util/capstone.c @@ -268,7 +268,8 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused, !strcmp(args->options->disassembler_style, "att")) disassembler_style = true; - if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0) + if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit, + disassembler_style) < 0) goto err; needs_cs_close = true; @@ -382,7 +383,8 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused, !strcmp(args->options->disassembler_style, "att")) disassembler_style = true; - if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0) + if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit, + disassembler_style) < 0) goto err; needs_cs_close = true; diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 8f52e8cefcf3..ae9a9065aab7 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -254,7 +254,6 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, addr_location__init(&al); al.sym = node->ms.sym; al.map = map__get(node->ms.map); - al.maps = maps__get(thread__maps(thread)); al.addr = node->ip; al.thread = thread__get(thread); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 8c3e9094600a..d81469db0aac 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -28,6 +28,7 @@ #include "namespaces.h" #include "srcline.h" #include "symbol.h" +#include "thread.h" #include "util.h" static regex_t file_lineno; @@ -277,7 +278,7 @@ find_target: .addr = map__objdump_2mem(map, ops->target.addr), }; - if (maps__find_ams(ms->maps, &target) == 0 && + if (maps__find_ams(thread__maps(ms->thread), &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; @@ -411,7 +412,7 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct * Actual navigation will come next, with further understanding of how * the symbol searching and disassembly should be done. */ - if (maps__find_ams(ms->maps, &target) == 0 && + if (maps__find_ams(thread__maps(ms->thread), &target) == 0 && map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) ops->target.sym = target.ms.sym; @@ -1074,7 +1075,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, .ms = { .map = map__get(map), }, }; - if (!maps__find_ams(args->ms->maps, &target) && + if (!maps__find_ams(thread__maps(args->ms->thread), &target) && target.ms.sym->start == target.al_addr) dl->ops.target.sym = target.ms.sym; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 2dde1044b5a7..bc045fddf7d5 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -698,7 +698,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, struct machine *machine = maps__machine(maps); bool load_map = false; - maps__zput(al->maps); map__zput(al->map); thread__zput(al->thread); al->thread = thread__get(thread); @@ -736,7 +735,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, return NULL; } - al->maps = maps__get(maps); al->map = maps__find(maps, al->addr); if (al->map != NULL) { /* diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index ef4b569f7df4..7ffaa3d9851b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -251,7 +251,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) if (h->cgroup) { const char *cgrp_name = "unknown"; - struct cgroup *cgrp = cgroup__find(maps__machine(h->ms.maps)->env, + struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(h->ms.thread))->env, h->cgroup); if (cgrp != NULL) cgrp_name = cgrp->name; @@ -536,7 +536,7 @@ static int hist_entry__init(struct hist_entry *he, memset(&he->stat, 0, sizeof(he->stat)); } - he->ms.maps = maps__get(he->ms.maps); + he->ms.thread = thread__get(he->ms.thread); he->ms.map = map__get(he->ms.map); if (he->branch_info) { @@ -552,9 +552,9 @@ static int hist_entry__init(struct hist_entry *he, memcpy(he->branch_info, template->branch_info, sizeof(*he->branch_info)); - he->branch_info->from.ms.maps = maps__get(he->branch_info->from.ms.maps); + he->branch_info->from.ms.thread = thread__get(he->branch_info->from.ms.thread); he->branch_info->from.ms.map = map__get(he->branch_info->from.ms.map); - he->branch_info->to.ms.maps = maps__get(he->branch_info->to.ms.maps); + he->branch_info->to.ms.thread = thread__get(he->branch_info->to.ms.thread); he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } @@ -810,7 +810,7 @@ __hists__add_entry(struct hists *hists, }, .cgroup = sample->cgroup, .ms = { - .maps = al->maps, + .thread = al->thread, .map = al->map, .sym = al->sym, }, @@ -890,7 +890,7 @@ struct hist_entry *hists__add_entry_block(struct hists *hists, .block_info = block_info, .hists = hists, .ms = { - .maps = al->maps, + .thread = al->thread, .map = al->map, .sym = al->sym, }, @@ -1020,8 +1020,8 @@ iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al) if (iter->curr >= iter->total) return 0; - maps__put(al->maps); - al->maps = maps__get(bi[i].to.ms.maps); + thread__put(al->thread); + al->thread = thread__get(bi[i].to.ms.thread); map__put(al->map); al->map = map__get(bi[i].to.ms.map); al->sym = bi[i].to.ms.sym; @@ -1232,7 +1232,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, .comm = thread__comm(al->thread), .ip = al->addr, .ms = { - .maps = al->maps, + .thread = al->thread, .map = al->map, .sym = al->sym, }, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 30d606fbf040..5b0f5a48ffd4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2016,7 +2016,7 @@ static void ip__resolve_ams(struct thread *thread, ams->addr = ip; ams->al_addr = al.addr; ams->al_level = al.level; - ams->ms.maps = maps__get(al.maps); + ams->ms.thread = thread__get(al.thread); ams->ms.sym = al.sym; ams->ms.map = map__get(al.map); ams->phys_addr = 0; @@ -2037,7 +2037,7 @@ static void ip__resolve_data(struct thread *thread, ams->addr = addr; ams->al_addr = al.addr; ams->al_level = al.level; - ams->ms.maps = maps__get(al.maps); + ams->ms.thread = thread__get(al.thread); ams->ms.sym = al.sym; ams->ms.map = map__get(al.map); ams->phys_addr = phys_addr; @@ -2120,7 +2120,7 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms } ilist_ms = (struct map_symbol) { - .maps = maps__get(ms->maps), + .thread = thread__get(ms->thread), .map = map__get(map), }; list_for_each_entry(ilist, &inline_node->val, list) { @@ -2220,7 +2220,7 @@ static int add_callchain_ip(struct thread *thread, iter_cycles = iter->cycles; } - ms.maps = maps__get(al.maps); + ms.thread = thread__get(al.thread); ms.map = map__get(al.map); ms.sym = al.sym; @@ -2383,7 +2383,7 @@ static void save_lbr_cursor_node(struct thread *thread, map_symbol__exit(&lbr_stitch->prev_lbr_cursor[idx].ms); memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr, sizeof(struct callchain_cursor_node)); - lbr_stitch->prev_lbr_cursor[idx].ms.maps = maps__get(cursor->curr->ms.maps); + lbr_stitch->prev_lbr_cursor[idx].ms.thread = thread__get(cursor->curr->ms.thread); lbr_stitch->prev_lbr_cursor[idx].ms.map = map__get(cursor->curr->ms.map); lbr_stitch->prev_lbr_cursor[idx].valid = true; @@ -2596,7 +2596,8 @@ static bool has_stitched_lbr(struct thread *thread, memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i], sizeof(struct callchain_cursor_node)); - stitch_node->cursor.ms.maps = maps__get(lbr_stitch->prev_lbr_cursor[i].ms.maps); + stitch_node->cursor.ms.thread = + thread__get(lbr_stitch->prev_lbr_cursor[i].ms.thread); stitch_node->cursor.ms.map = map__get(lbr_stitch->prev_lbr_cursor[i].ms.map); if (callee) diff --git a/tools/perf/util/map_symbol.c b/tools/perf/util/map_symbol.c index 6ad2960bc289..11bc0a7f704c 100644 --- a/tools/perf/util/map_symbol.c +++ b/tools/perf/util/map_symbol.c @@ -2,10 +2,11 @@ #include "map_symbol.h" #include "maps.h" #include "map.h" +#include "thread.h" void map_symbol__exit(struct map_symbol *ms) { - maps__zput(ms->maps); + thread__zput(ms->thread); map__zput(ms->map); } @@ -16,7 +17,7 @@ void addr_map_symbol__exit(struct addr_map_symbol *ams) void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src) { - dst->maps = maps__get(src->maps); + dst->thread = thread__get(src->thread); dst->map = map__get(src->map); dst->sym = src->sym; } diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h index e370bb32ed47..7437e319f4a3 100644 --- a/tools/perf/util/map_symbol.h +++ b/tools/perf/util/map_symbol.h @@ -4,12 +4,13 @@ #include +struct thread; struct maps; struct map; struct symbol; struct map_symbol { - struct maps *maps; + struct thread *thread; struct map *map; struct symbol *sym; }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f963d61ac166..01a9d73ae348 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1016,7 +1016,7 @@ static int hist_entry__cgroup_snprintf(struct hist_entry *he, const char *cgrp_name = "N/A"; if (he->cgroup) { - struct cgroup *cgrp = cgroup__find(maps__machine(he->ms.maps)->env, + struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(he->ms.thread))->env, he->cgroup); if (cgrp != NULL) cgrp_name = cgrp->name; diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index c1646ef5f971..9cb0960ef905 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -161,7 +161,7 @@ static int entry(u64 ip, struct unwind_info *ui) } e->ip = ip; - e->ms.maps = maps__get(al.maps); + e->ms.thread = thread__get(al.thread); e->ms.map = map__get(al.map); e->ms.sym = al.sym; diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index a24b45106acd..ecf0c01fe51f 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -666,7 +666,7 @@ static int entry(u64 ip, struct thread *thread, e.ms.sym = thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al); e.ip = ip; e.ms.map = al.map; - e.ms.maps = al.maps; + e.ms.thread = thread__get(al.thread); pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n", al.sym ? al.sym->name : "''", -- cgit v1.2.3 From 0e26ba5a87744ee8957cc1f341e403c0fd758398 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:15 -0800 Subject: perf disasm: Refactor arch__find and initialization of arch structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch arch__find to using an ELF machine number rather than a string. Rather than an array of fixed size arch structs turn the init functions into new functions indexed by the ELF machine they correspond to. This allows data to be stored with a struct arch with the container_of trick, so the priv variable can be removed. Switch to using the thread to find the arch rather than the evsel as the evsel only has limited notions of the running thread upon which disassembly is performed. Factor out the e_machine and e_flags into their own struct to make them easier to pass around. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li [ Include elf.h for EM_CSKY and friends and also conditionally define EM_CSKY_ABIMASK for old distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate-arch/annotate-arc.c | 14 +- tools/perf/util/annotate-arch/annotate-arm.c | 39 ++--- tools/perf/util/annotate-arch/annotate-arm64.c | 39 ++--- tools/perf/util/annotate-arch/annotate-csky.c | 14 +- tools/perf/util/annotate-arch/annotate-loongarch.c | 19 ++- tools/perf/util/annotate-arch/annotate-mips.c | 19 ++- tools/perf/util/annotate-arch/annotate-powerpc.c | 24 ++-- tools/perf/util/annotate-arch/annotate-riscv64.c | 19 ++- tools/perf/util/annotate-arch/annotate-s390.c | 29 ++-- tools/perf/util/annotate-arch/annotate-sparc.c | 19 ++- tools/perf/util/annotate-arch/annotate-x86.c | 24 ++-- tools/perf/util/annotate.c | 46 +++--- tools/perf/util/annotate.h | 2 +- tools/perf/util/disasm.c | 157 ++++++++++----------- tools/perf/util/disasm.h | 59 ++++---- 16 files changed, 283 insertions(+), 242 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 91ded9c271ee..ea17e6d29a7e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -1198,7 +1198,7 @@ int __hist_entry__tui_annotate(struct hist_entry *he, struct map_symbol *ms, ui__warning("Annotation has no source code."); } } else { - err = evsel__get_arch(evsel, &browser.arch); + err = thread__get_arch(ms->thread, &browser.arch); if (err) { annotate_browser__symbol_annotate_error(&browser, err); return -1; diff --git a/tools/perf/util/annotate-arch/annotate-arc.c b/tools/perf/util/annotate-arch/annotate-arc.c index d7ca08ca5600..170103e383a4 100644 --- a/tools/perf/util/annotate-arch/annotate-arc.c +++ b/tools/perf/util/annotate-arch/annotate-arc.c @@ -1,10 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "../disasm.h" -int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - arch->initialized = true; + struct arch *arch = zalloc(sizeof(*arch)); + + if (!arch) + return NULL; + + arch->name = "arc"; + arch->id = *id; arch->objdump.comment_char = ';'; - return 0; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-arm.c b/tools/perf/util/annotate-arch/annotate-arm.c index 08c49067c3c9..afb413c80156 100644 --- a/tools/perf/util/annotate-arch/annotate-arm.c +++ b/tools/perf/util/annotate-arch/annotate-arm.c @@ -7,14 +7,15 @@ #include "../annotate.h" #include "../disasm.h" -struct arm_annotate { - regex_t call_insn, - jump_insn; +struct arch_arm { + struct arch arch; + regex_t call_insn; + regex_t jump_insn; }; static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name) { - struct arm_annotate *arm = arch->priv; + struct arch_arm *arm = container_of(arch, struct arch_arm, arch); const struct ins_ops *ops; regmatch_t match[2]; @@ -29,37 +30,39 @@ static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, c return ops; } -int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - struct arm_annotate *arm; int err; + struct arch_arm *arm = zalloc(sizeof(*arm)); + struct arch *arch; - if (arch->initialized) - return 0; - - arm = zalloc(sizeof(*arm)); if (!arm) - return ENOMEM; + return NULL; + + arch = &arm->arch; + arch->name = "arm"; + arch->id = *id; + arch->objdump.comment_char = ';'; + arch->objdump.skip_functions_char = '+'; + arch->associate_instruction_ops = arm__associate_instruction_ops; #define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)" err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED); if (err) goto out_free_arm; + err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED); if (err) goto out_free_call; #undef ARM_CONDS - arch->initialized = true; - arch->priv = arm; - arch->associate_instruction_ops = arm__associate_instruction_ops; - arch->objdump.comment_char = ';'; - arch->objdump.skip_functions_char = '+'; - return 0; + return arch; out_free_call: regfree(&arm->call_insn); out_free_arm: free(arm); - return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; + errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; + return NULL; } diff --git a/tools/perf/util/annotate-arch/annotate-arm64.c b/tools/perf/util/annotate-arch/annotate-arm64.c index d2ea32984b0d..33080fdca125 100644 --- a/tools/perf/util/annotate-arch/annotate-arm64.c +++ b/tools/perf/util/annotate-arch/annotate-arm64.c @@ -8,9 +8,10 @@ #include "../annotate.h" #include "../disasm.h" -struct arm64_annotate { - regex_t call_insn, - jump_insn; +struct arch_arm64 { + struct arch arch; + regex_t call_insn; + regex_t jump_insn; }; static int arm64_mov__parse(const struct arch *arch __maybe_unused, @@ -70,7 +71,7 @@ static const struct ins_ops arm64_mov_ops = { static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) { - struct arm64_annotate *arm = arch->priv; + struct arch_arm64 *arm = container_of(arch, struct arch_arm64, arch); const struct ins_ops *ops; regmatch_t match[2]; @@ -87,38 +88,40 @@ static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, return ops; } -int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - struct arm64_annotate *arm; int err; + struct arch_arm64 *arm = zalloc(sizeof(*arm)); + struct arch *arch; - if (arch->initialized) - return 0; - - arm = zalloc(sizeof(*arm)); if (!arm) - return ENOMEM; + return NULL; + + arch = &arm->arch; + arch->name = "arm64"; + arch->id = *id; + arch->objdump.comment_char = '/'; + arch->objdump.skip_functions_char = '+'; + arch->associate_instruction_ops = arm64__associate_instruction_ops; /* bl, blr */ err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED); if (err) goto out_free_arm; + /* b, b.cond, br, cbz/cbnz, tbz/tbnz */ err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$", REG_EXTENDED); if (err) goto out_free_call; - arch->initialized = true; - arch->priv = arm; - arch->associate_instruction_ops = arm64__associate_instruction_ops; - arch->objdump.comment_char = '/'; - arch->objdump.skip_functions_char = '+'; - return 0; + return arch; out_free_call: regfree(&arm->call_insn); out_free_arm: free(arm); - return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; + errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; + return NULL; } diff --git a/tools/perf/util/annotate-arch/annotate-csky.c b/tools/perf/util/annotate-arch/annotate-csky.c index 0b0b09b068ec..d2b18e4ea2c9 100644 --- a/tools/perf/util/annotate-arch/annotate-csky.c +++ b/tools/perf/util/annotate-arch/annotate-csky.c @@ -2,6 +2,7 @@ // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. #include #include +#include #include "../disasm.h" static const struct ins_ops *csky__associate_ins_ops(struct arch *arch, @@ -39,10 +40,17 @@ static const struct ins_ops *csky__associate_ins_ops(struct arch *arch, return ops; } -int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - arch->initialized = true; + struct arch *arch = zalloc(sizeof(*arch)); + + if (!arch) + return NULL; + + arch->name = "csky"; + arch->id = *id; arch->objdump.comment_char = '/'; arch->associate_instruction_ops = csky__associate_ins_ops; - return 0; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c index 6c94cb98a104..3aeab453a059 100644 --- a/tools/perf/util/annotate-arch/annotate-loongarch.c +++ b/tools/perf/util/annotate-arch/annotate-loongarch.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "../disasm.h" #include "../map.h" #include "../maps.h" @@ -139,13 +140,17 @@ const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char return ops; } -int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - if (!arch->initialized) { - arch->associate_instruction_ops = loongarch__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - } + struct arch *arch = zalloc(sizeof(*arch)); - return 0; + if (!arch) + return NULL; + + arch->name = "loongarch"; + arch->id = *id; + arch->associate_instruction_ops = loongarch__associate_ins_ops; + arch->objdump.comment_char = '#'; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-mips.c b/tools/perf/util/annotate-arch/annotate-mips.c index f14b34ed77d3..e8d1c6c7e9f3 100644 --- a/tools/perf/util/annotate-arch/annotate-mips.c +++ b/tools/perf/util/annotate-arch/annotate-mips.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "../disasm.h" static @@ -36,13 +37,17 @@ const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *nam return ops; } -int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - if (!arch->initialized) { - arch->associate_instruction_ops = mips__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - } + struct arch *arch = zalloc(sizeof(*arch)); - return 0; + if (!arch) + return NULL; + + arch->name = "mips"; + arch->id = *id; + arch->objdump.comment_char = '#'; + arch->associate_instruction_ops = mips__associate_ins_ops; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-powerpc.c b/tools/perf/util/annotate-arch/annotate-powerpc.c index 593c138c8104..218207b52581 100644 --- a/tools/perf/util/annotate-arch/annotate-powerpc.c +++ b/tools/perf/util/annotate-arch/annotate-powerpc.c @@ -390,17 +390,21 @@ static void update_insn_state_powerpc(struct type_state *state, } #endif /* HAVE_LIBDW_SUPPORT */ -int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = powerpc__associate_instruction_ops; - arch->objdump.comment_char = '#'; - annotate_opts.show_asm_raw = true; + struct arch *arch = zalloc(sizeof(*arch)); + + if (!arch) + return NULL; + + arch->name = "powerpc"; + arch->id = *id; + arch->objdump.comment_char = '#'; + annotate_opts.show_asm_raw = true; + arch->associate_instruction_ops = powerpc__associate_instruction_ops; #ifdef HAVE_LIBDW_SUPPORT - arch->update_insn_state = update_insn_state_powerpc; + arch->update_insn_state = update_insn_state_powerpc; #endif - } - - return 0; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-riscv64.c b/tools/perf/util/annotate-arch/annotate-riscv64.c index 15526824037a..29a988fca8c9 100644 --- a/tools/perf/util/annotate-arch/annotate-riscv64.c +++ b/tools/perf/util/annotate-arch/annotate-riscv64.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "../disasm.h" static @@ -24,13 +25,17 @@ const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char * return ops; } -int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - if (!arch->initialized) { - arch->associate_instruction_ops = riscv64__associate_ins_ops; - arch->initialized = true; - arch->objdump.comment_char = '#'; - } + struct arch *arch = zalloc(sizeof(*arch)); - return 0; + if (!arch) + return NULL; + + arch->name = "riscv"; + arch->id = *id; + arch->objdump.comment_char = '#'; + arch->associate_instruction_ops = riscv64__associate_ins_ops; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c index 47573f0310c1..af9cabd0a586 100644 --- a/tools/perf/util/annotate-arch/annotate-s390.c +++ b/tools/perf/util/annotate-arch/annotate-s390.c @@ -148,7 +148,7 @@ static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const ch return ops; } -static int s390__cpuid_parse(struct arch *arch, char *cpuid) +static int s390__cpuid_parse(struct arch *arch, const char *cpuid) { unsigned int family; char model[16], model_c[16], cpumf_v[16], cpumf_a[16]; @@ -169,19 +169,22 @@ static int s390__cpuid_parse(struct arch *arch, char *cpuid) return -1; } -int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid) { - int err = 0; - - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = s390__associate_ins_ops; - if (cpuid) { - if (s390__cpuid_parse(arch, cpuid)) - err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + struct arch *arch = zalloc(sizeof(*arch)); + + if (!arch) + return NULL; + + arch->name = "s390"; + arch->id = *id; + arch->associate_instruction_ops = s390__associate_ins_ops; + if (cpuid) { + if (s390__cpuid_parse(arch, cpuid)) { + errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + return NULL; } - arch->objdump.comment_char = '#'; } - - return err; + arch->objdump.comment_char = '#'; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-sparc.c b/tools/perf/util/annotate-arch/annotate-sparc.c index 66a0174376dd..2f07bc7a56dd 100644 --- a/tools/perf/util/annotate-arch/annotate-sparc.c +++ b/tools/perf/util/annotate-arch/annotate-sparc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "../../util/disasm.h" static int is_branch_cond(const char *cond) @@ -160,13 +161,17 @@ static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, return ops; } -int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id, + const char *cpuid __maybe_unused) { - if (!arch->initialized) { - arch->initialized = true; - arch->associate_instruction_ops = sparc__associate_instruction_ops; - arch->objdump.comment_char = '#'; - } + struct arch *arch = zalloc(sizeof(*arch)); - return 0; + if (!arch) + return NULL; + + arch->name = "sparc"; + arch->id = *id; + arch->associate_instruction_ops = sparc__associate_instruction_ops; + arch->objdump.comment_char = '#'; + return arch; } diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c index 0c7957fe60da..eb9a649ca656 100644 --- a/tools/perf/util/annotate-arch/annotate-x86.c +++ b/tools/perf/util/annotate-arch/annotate-x86.c @@ -182,7 +182,7 @@ static bool intel__ins_is_fused(const struct arch *arch, const char *ins1, return false; } -static int x86__cpuid_parse(struct arch *arch, char *cpuid) +static int x86__cpuid_parse(struct arch *arch, const char *cpuid) { unsigned int family, model, stepping; int ret; @@ -777,18 +777,21 @@ retry: } #endif -int x86__annotate_init(struct arch *arch, char *cpuid) +const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid) { - int err = 0; + struct arch *arch = zalloc(sizeof(*arch)); - if (arch->initialized) - return 0; + if (!arch) + return NULL; + arch->name = "x86"; + arch->id = *id; if (cpuid) { - if (x86__cpuid_parse(arch, cpuid)) - err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + if (x86__cpuid_parse(arch, cpuid)) { + errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + return NULL; + } } - arch->instructions = x86__instructions; arch->nr_instructions = ARRAY_SIZE(x86__instructions); #ifndef NDEBUG @@ -810,11 +813,8 @@ int x86__annotate_init(struct arch *arch, char *cpuid) arch->objdump.memory_ref_char = '('; arch->objdump.imm_char = '$'; arch->insn_suffix = "bwlq"; - arch->e_machine = EM_X86_64; - arch->e_flags = 0; - arch->initialized = true; #ifdef HAVE_LIBDW_SUPPORT arch->update_insn_state = update_insn_state_x86; #endif - return err; + return arch; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 79702072568b..c16c6dfaa959 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -980,32 +980,27 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel) annotation__calc_percent(notes, evsel, symbol__size(sym)); } -int evsel__get_arch(struct evsel *evsel, const struct arch **parch) +int thread__get_arch(struct thread *thread, const struct arch **parch) { - struct perf_env *env = evsel__env(evsel); - const char *arch_name = perf_env__arch(env); const struct arch *arch; - int err; + struct machine *machine; + uint16_t e_machine; - if (!arch_name) { + if (!thread) { *parch = NULL; - return errno; + return -1; } - *parch = arch = arch__find(arch_name); + machine = maps__machine(thread__maps(thread)); + e_machine = thread__e_machine(thread, machine); + arch = arch__find(e_machine, machine->env ? machine->env->cpuid : NULL); if (arch == NULL) { - pr_err("%s: unsupported arch %s\n", __func__, arch_name); - return ENOTSUP; + pr_err("%s: unsupported arch %d\n", __func__, e_machine); + return errno; } + if (parch) + *parch = arch; - if (arch->init) { - err = arch->init((struct arch *)arch, env ? env->cpuid : NULL); - if (err) { - pr_err("%s: failed to initialize %s arch priv area\n", - __func__, arch->name); - return err; - } - } return 0; } @@ -1020,7 +1015,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, const struct arch *arch = NULL; int err, nr; - err = evsel__get_arch(evsel, &arch); + err = thread__get_arch(ms->thread, &arch); if (err) return err; @@ -1268,7 +1263,7 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel) apd.addr_fmt_width = annotated_source__addr_fmt_width(¬es->src->source, notes->src->start); - evsel__get_arch(evsel, &apd.arch); + thread__get_arch(ms->thread, &apd.arch); apd.dbg = dso__debuginfo(dso); list_for_each_entry(pos, ¬es->src->source, node) { @@ -1373,7 +1368,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, struct annotation_line *al; if (annotate_opts.code_with_type) { - evsel__get_arch(apd->evsel, &apd->arch); + thread__get_arch(apd->he->ms.thread, &apd->arch); apd->dbg = dso__debuginfo(map__dso(apd->he->ms.map)); } @@ -2495,7 +2490,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str, if (regname == NULL) return -1; - op_loc->reg1 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags); + op_loc->reg1 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags); free(regname); /* Get the second register */ @@ -2508,7 +2503,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str, if (regname == NULL) return -1; - op_loc->reg2 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags); + op_loc->reg2 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags); free(regname); } return 0; @@ -2607,8 +2602,11 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl, if (s == NULL) return -1; - if (*s == arch->objdump.register_char) - op_loc->reg1 = get_dwarf_regnum(s, arch->e_machine, arch->e_flags); + if (*s == arch->objdump.register_char) { + op_loc->reg1 = get_dwarf_regnum(s, + arch->id.e_machine, + arch->id.e_flags); + } else if (*s == arch->objdump.imm_char) { op_loc->offset = strtol(s + 1, &p, 0); if (p && p != s + 1) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 58eaf4b2fa65..696e36dbf013 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -586,5 +586,5 @@ int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr, int num_aggr, struct evsel *evsel); int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header); -int evsel__get_arch(struct evsel *evsel, const struct arch **parch); +int thread__get_arch(struct thread *thread, const struct arch **parch); #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index d81469db0aac..4f60726247d6 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include +#ifndef EF_CSKY_ABIMASK +#define EF_CSKY_ABIMASK 0XF0000000 +#endif #include #include #include @@ -102,112 +106,101 @@ int arch__associate_ins_ops(struct arch *arch, const char *name, const struct in return 0; } -static struct arch architectures[] = { - { - .name = "arc", - .init = arc__annotate_init, - .e_machine = EM_ARC, - }, - { - .name = "arm", - .init = arm__annotate_init, - .e_machine = EM_ARM, - }, - { - .name = "arm64", - .init = arm64__annotate_init, - .e_machine = EM_AARCH64, - }, - { - .name = "csky", - .init = csky__annotate_init, - .e_machine = EM_CSKY, -#if defined(__CSKYABIV2__) - .e_flags = EF_CSKY_ABIV2, -#else - .e_flags = EF_CSKY_ABIV1, -#endif - }, - { - .name = "mips", - .init = mips__annotate_init, - .e_machine = EM_MIPS, - }, - { - .name = "x86", - .init = x86__annotate_init, - .e_machine = EM_X86_64, // TODO: EM_386 too. - }, - { - .name = "powerpc", - .init = powerpc__annotate_init, - .e_machine = EM_PPC, // TODO: EM_PPC64 too. - }, - { - .name = "riscv64", - .init = riscv64__annotate_init, - .e_machine = EM_RISCV, - }, - { - .name = "s390", - .init = s390__annotate_init, - .e_machine = EM_S390, - }, - { - .name = "sparc", - .init = sparc__annotate_init, - .e_machine = EM_SPARC, - }, - { - .name = "loongarch", - .init = loongarch__annotate_init, - .e_machine = EM_LOONGARCH, - }, -}; +static int e_machine_and_eflags__cmp(const struct e_machine_and_e_flags *val1, + const struct e_machine_and_e_flags *val2) +{ + if (val1->e_machine == val2->e_machine) { + if (val1->e_machine != EM_CSKY) + return 0; + if ((val1->e_flags & EF_CSKY_ABIMASK) < (val2->e_flags & EF_CSKY_ABIMASK)) + return -1; + return (val1->e_flags & EF_CSKY_ABIMASK) > (val2->e_flags & EF_CSKY_ABIMASK); + } + return val1->e_machine < val2->e_machine ? -1 : 1; +} -static int arch__key_cmp(const void *name, const void *archp) +static int arch__key_cmp(const void *key, const void *archp) { - const struct arch *arch = archp; + const struct arch *const *arch = archp; - return strcmp(name, arch->name); + return e_machine_and_eflags__cmp(key, &(*arch)->id); } static int arch__cmp(const void *a, const void *b) { - const struct arch *aa = a; - const struct arch *ab = b; + const struct arch *const *aa = a; + const struct arch *const *ab = b; - return strcmp(aa->name, ab->name); + return e_machine_and_eflags__cmp(&(*aa)->id, &(*ab)->id); } -static void arch__sort(void) +const struct arch *arch__find(uint16_t e_machine, const char *cpuid) { - const int nmemb = ARRAY_SIZE(architectures); + static const struct arch *(*const arch_new_fn[])(const struct e_machine_and_e_flags *id, + const char *cpuid) = { + [EM_386] = arch__new_x86, + [EM_ARC] = arch__new_arc, + [EM_ARM] = arch__new_arm, + [EM_AARCH64] = arch__new_arm64, + [EM_CSKY] = arch__new_csky, + [EM_LOONGARCH] = arch__new_loongarch, + [EM_MIPS] = arch__new_mips, + [EM_PPC64] = arch__new_powerpc, + [EM_PPC] = arch__new_powerpc, + [EM_RISCV] = arch__new_riscv64, + [EM_S390] = arch__new_s390, + [EM_SPARC] = arch__new_sparc, + [EM_SPARCV9] = arch__new_sparc, + [EM_X86_64] = arch__new_x86, + }; + static const struct arch **archs; + static size_t num_archs; + struct e_machine_and_e_flags key = { + .e_machine = e_machine, + // TODO: e_flags should really come from the same source as e_machine. + .e_flags = EF_HOST, + }; + const struct arch *result = NULL, **tmp; - qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); -} + if (num_archs > 0) { + tmp = bsearch(&key, archs, num_archs, sizeof(*archs), arch__key_cmp); + if (tmp) + result = *tmp; + } -const struct arch *arch__find(const char *name) -{ - const int nmemb = ARRAY_SIZE(architectures); - static bool sorted; + if (result) + return result; - if (!sorted) { - arch__sort(); - sorted = true; + if (e_machine >= ARRAY_SIZE(arch_new_fn) || arch_new_fn[e_machine] == NULL) { + errno = ENOTSUP; + return NULL; } - return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); + tmp = reallocarray(archs, num_archs + 1, sizeof(*archs)); + if (!tmp) + return NULL; + + result = arch_new_fn[e_machine](&key, cpuid); + if (!result) { + pr_err("%s: failed to initialize %s (%u) arch priv area\n", + __func__, result->name, e_machine); + free(tmp); + return NULL; + } + archs = tmp; + archs[num_archs++] = result; + qsort(archs, num_archs, sizeof(*archs), arch__cmp); + return result; } bool arch__is_x86(const struct arch *arch) { - return arch->e_machine == EM_386 || arch->e_machine == EM_X86_64; + return arch->id.e_machine == EM_386 || arch->id.e_machine == EM_X86_64; } bool arch__is_powerpc(const struct arch *arch) { - return arch->e_machine == EM_PPC || arch->e_machine == EM_PPC64; + return arch->id.e_machine == EM_PPC || arch->id.e_machine == EM_PPC64; } static void ins_ops__delete(struct ins_operands *ops) diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index b6a2a30fdf27..2793d48aa04e 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -17,21 +17,23 @@ struct data_loc_info; struct type_state; struct disasm_line; +struct e_machine_and_e_flags { + uint16_t e_machine; + uint32_t e_flags; +}; + struct arch { - const char *name; + /** @id: ELF machine and flags associated with arch. */ + struct e_machine_and_e_flags id; + /** @name: name such as "x86" or "powerpc". */ + const char *name; const struct ins *instructions; - size_t nr_instructions; - size_t nr_instructions_allocated; - const struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); - bool sorted_instructions; - bool initialized; - const char *insn_suffix; - void *priv; - unsigned int model; - unsigned int family; - int (*init)(struct arch *arch, char *cpuid); - bool (*ins_is_fused)(const struct arch *arch, const char *ins1, - const char *ins2); + size_t nr_instructions; + size_t nr_instructions_allocated; + bool sorted_instructions; + const char *insn_suffix; + unsigned int model; + unsigned int family; struct { char comment_char; char skip_functions_char; @@ -39,15 +41,14 @@ struct arch { char memory_ref_char; char imm_char; } objdump; + bool (*ins_is_fused)(const struct arch *arch, const char *ins1, + const char *ins2); + const struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); #ifdef HAVE_LIBDW_SUPPORT void (*update_insn_state)(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, struct disasm_line *dl); #endif - /** @e_machine: ELF machine associated with arch. */ - unsigned int e_machine; - /** @e_flags: Optional ELF flags associated with arch. */ - unsigned int e_flags; }; struct ins { @@ -107,7 +108,7 @@ struct annotate_args { char *fileloc; }; -const struct arch *arch__find(const char *name); +const struct arch *arch__find(uint16_t e_machine, const char *cpuid); bool arch__is_x86(const struct arch *arch); bool arch__is_powerpc(const struct arch *arch); @@ -121,17 +122,17 @@ extern const struct ins_ops ret_ops; int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops); -int arc__annotate_init(struct arch *arch, char *cpuid); -int arm__annotate_init(struct arch *arch, char *cpuid); -int arm64__annotate_init(struct arch *arch, char *cpuid); -int csky__annotate_init(struct arch *arch, char *cpuid); -int loongarch__annotate_init(struct arch *arch, char *cpuid); -int mips__annotate_init(struct arch *arch, char *cpuid); -int powerpc__annotate_init(struct arch *arch, char *cpuid); -int riscv64__annotate_init(struct arch *arch, char *cpuid); -int s390__annotate_init(struct arch *arch, char *cpuid); -int sparc__annotate_init(struct arch *arch, char *cpuid); -int x86__annotate_init(struct arch *arch, char *cpuid); +const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id, const char *cpuid); +const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid); const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl); -- cgit v1.2.3 From dc329efc162ac168e2a0c83d1334608371dd525b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 13:35:16 -0800 Subject: perf disasm: Minor layout tweaks for 'struct arch' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pack some holes to bring down the overall struct size from 96 to 88 bytes. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Athira Rajeev Cc: Bill Wendling Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Julia Lawall Cc: Justin Stitt Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Suchit Karunakaran Cc: Thomas Falcon Cc: Tianyou Li Cc: Will Deacon Cc: Zecheng Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 2793d48aa04e..6a1905f9d4fc 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -18,22 +18,22 @@ struct type_state; struct disasm_line; struct e_machine_and_e_flags { - uint16_t e_machine; uint32_t e_flags; + uint16_t e_machine; }; struct arch { - /** @id: ELF machine and flags associated with arch. */ - struct e_machine_and_e_flags id; /** @name: name such as "x86" or "powerpc". */ const char *name; const struct ins *instructions; size_t nr_instructions; size_t nr_instructions_allocated; - bool sorted_instructions; const char *insn_suffix; unsigned int model; unsigned int family; + /** @id: ELF machine and flags associated with arch. */ + struct e_machine_and_e_flags id; + bool sorted_instructions; struct { char comment_char; char skip_functions_char; -- cgit v1.2.3 From e786a04b4a5461dd7e2829422314a5a6d5a664d9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 09:58:46 -0800 Subject: perf inject: With --convert-callchain ignore the dummy event for dwarf stacks On hybrid systems there is generally >1 event and a dummy event. The perf inject --convert-callchain option is failing to convert perf.data files on such systems reporting "--convert-callchain requires DWARF call graph." The failing event is the dummy event that doesn't need to be set up for samples. As such ignore this event when checking the evsels. Fixes: 92ea788d2af4e65a ("perf inject: Add --convert-callchain option") Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 2c9456614cde..5b29f4296861 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2729,7 +2729,7 @@ int cmd_inject(int argc, const char **argv) } evlist__for_each_entry(inject.session->evlist, evsel) { - if (!evsel__has_dwarf_callchain(evsel)) { + if (!evsel__has_dwarf_callchain(evsel) && !evsel__is_dummy_event(evsel)) { pr_err("--convert-callchain requires DWARF call graph.\n"); goto out_delete; } -- cgit v1.2.3 From c5e47e4d00fbc15f2390bb6ed8d9c21836363291 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jan 2026 09:53:37 -0800 Subject: perf tests sched: Avoid error in cleanup on loaded machines The stop_noploops function will kill the noploop processes that are running for 10 seconds. On a loaded machine they may have already terminated meaning the kill will return an error of no such process. This doesn't matter and so ignore the error to avoid the test terminating in the cleanup. Fixes: 0e22c5ca44e68798 ("perf test: Add sched latency and script shell tests") Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/sched.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/sched.sh b/tools/perf/tests/shell/sched.sh index b9b81eaf856e..b9637069adb1 100755 --- a/tools/perf/tests/shell/sched.sh +++ b/tools/perf/tests/shell/sched.sh @@ -53,7 +53,7 @@ start_noploops() { } cleanup_noploops() { - kill "$PID1" "$PID2" + kill "$PID1" "$PID2" || true } test_sched_record() { -- cgit v1.2.3 From f0d98c78f8bf73ce2a9b7793f66cda240fa9ab10 Mon Sep 17 00:00:00 2001 From: Suchit Karunakaran Date: Thu, 22 Jan 2026 22:47:04 +0530 Subject: perf annotate: Fix memcpy size in arch__grow_instructions() The memcpy() in arch__grow_instructions() is copying the wrong number of bytes when growing from a non-allocated table. It should copy arch->nr_instructions * sizeof(struct ins) bytes, not just arch->nr_instructions bytes. This bug causes data corruption as only a partial copy of the instruction table is made, leading to garbage data in most entries and potential crashes Fixes: 2a1ff812c40be982 ("perf annotate: Introduce alternative method of keeping instructions table") Reviewed-by: Ian Rogers Signed-off-by: Suchit Karunakaran Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 4f60726247d6..9b0ba1fc5aec 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -82,7 +82,7 @@ grow_from_non_allocated_table: if (new_instructions == NULL) return -1; - memcpy(new_instructions, arch->instructions, arch->nr_instructions); + memcpy(new_instructions, arch->instructions, arch->nr_instructions * sizeof(struct ins)); goto out_update_instructions; } -- cgit v1.2.3 From 3d06db9bad1ad8e67c3981964cfba224c07fc306 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 20 Jan 2026 18:17:35 -0800 Subject: perf regs: Refactor use of arch__sample_reg_masks() to perf_reg_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch__sample_reg_masks isn't supported on ARM(32), csky, loongarch, MIPS, RISC-V and s390. The table returned by the function just has the name of a register paired with the corresponding sample_regs_user mask value. For a given perf register we can compute the name with perf_reg_name and the mask is just 1 left-shifted by the perf register number. Change __parse_regs to use this method for finding registers rather than arch__sample_reg_masks, thereby adding __parse_regs support for ARM(32), csky, loongarch, MIPS, RISC-V and s390. As arch__sample_reg_masks is then unused, remove the now unneeded declarations. Signed-off-by: Ian Rogers Tested-by: Thomas Richter Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andi Kleen Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Dapeng Mi Cc: Dmitriy Vyukov Cc: Dr. David Alan Gilbert Cc: Guo Ren Cc: Haibo Xu Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Krzysztof Łopatowski Cc: Leo Yan Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Thomas Falcon Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/perf_regs.c | 9 -- tools/perf/arch/arm64/util/machine.c | 14 +-- tools/perf/arch/arm64/util/perf_regs.c | 45 +------- tools/perf/arch/csky/util/perf_regs.c | 9 -- tools/perf/arch/loongarch/util/perf_regs.c | 9 -- tools/perf/arch/mips/util/perf_regs.c | 9 -- tools/perf/arch/powerpc/util/perf_regs.c | 68 ------------ tools/perf/arch/riscv/util/perf_regs.c | 9 -- tools/perf/arch/s390/util/perf_regs.c | 9 -- tools/perf/arch/x86/util/perf_regs.c | 47 --------- .../perf/util/arm64-frame-pointer-unwind-support.c | 3 +- tools/perf/util/parse-regs-options.c | 116 +++++++++++++-------- tools/perf/util/perf_regs.c | 9 -- tools/perf/util/perf_regs.h | 12 --- 14 files changed, 81 insertions(+), 287 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c index f94a0210c7b7..03a5bc0cf64c 100644 --- a/tools/perf/arch/arm/util/perf_regs.c +++ b/tools/perf/arch/arm/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c index aab1cc2bc283..80fb13c958d9 100644 --- a/tools/perf/arch/arm64/util/machine.c +++ b/tools/perf/arch/arm64/util/machine.c @@ -1,18 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include "debug.h" -#include "symbol.h" -#include "callchain.h" +#include "callchain.h" // prototype of arch__add_leaf_frame_record_opts #include "perf_regs.h" #include "record.h" -#include "util/perf_regs.h" + +#define SMPL_REG_MASK(b) (1ULL << (b)) void arch__add_leaf_frame_record_opts(struct record_opts *opts) { - const struct sample_reg *sample_reg_masks = arch__sample_reg_masks(); - - opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask; + opts->sample_user_regs |= SMPL_REG_MASK(PERF_REG_ARM64_LR); } diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c index 09308665e28a..9bb768e1bea1 100644 --- a/tools/perf/arch/arm64/util/perf_regs.c +++ b/tools/perf/arch/arm64/util/perf_regs.c @@ -12,48 +12,12 @@ #include "../../../util/event.h" #include "../../../util/perf_regs.h" +#define SMPL_REG_MASK(b) (1ULL << (b)) + #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG(x0, PERF_REG_ARM64_X0), - SMPL_REG(x1, PERF_REG_ARM64_X1), - SMPL_REG(x2, PERF_REG_ARM64_X2), - SMPL_REG(x3, PERF_REG_ARM64_X3), - SMPL_REG(x4, PERF_REG_ARM64_X4), - SMPL_REG(x5, PERF_REG_ARM64_X5), - SMPL_REG(x6, PERF_REG_ARM64_X6), - SMPL_REG(x7, PERF_REG_ARM64_X7), - SMPL_REG(x8, PERF_REG_ARM64_X8), - SMPL_REG(x9, PERF_REG_ARM64_X9), - SMPL_REG(x10, PERF_REG_ARM64_X10), - SMPL_REG(x11, PERF_REG_ARM64_X11), - SMPL_REG(x12, PERF_REG_ARM64_X12), - SMPL_REG(x13, PERF_REG_ARM64_X13), - SMPL_REG(x14, PERF_REG_ARM64_X14), - SMPL_REG(x15, PERF_REG_ARM64_X15), - SMPL_REG(x16, PERF_REG_ARM64_X16), - SMPL_REG(x17, PERF_REG_ARM64_X17), - SMPL_REG(x18, PERF_REG_ARM64_X18), - SMPL_REG(x19, PERF_REG_ARM64_X19), - SMPL_REG(x20, PERF_REG_ARM64_X20), - SMPL_REG(x21, PERF_REG_ARM64_X21), - SMPL_REG(x22, PERF_REG_ARM64_X22), - SMPL_REG(x23, PERF_REG_ARM64_X23), - SMPL_REG(x24, PERF_REG_ARM64_X24), - SMPL_REG(x25, PERF_REG_ARM64_X25), - SMPL_REG(x26, PERF_REG_ARM64_X26), - SMPL_REG(x27, PERF_REG_ARM64_X27), - SMPL_REG(x28, PERF_REG_ARM64_X28), - SMPL_REG(x29, PERF_REG_ARM64_X29), - SMPL_REG(lr, PERF_REG_ARM64_LR), - SMPL_REG(sp, PERF_REG_ARM64_SP), - SMPL_REG(pc, PERF_REG_ARM64_PC), - SMPL_REG(vg, PERF_REG_ARM64_VG), - SMPL_REG_END -}; - /* %xNUM */ #define SDT_OP_REGEX1 "^(x[1-2]?[0-9]|3[0-1])$" @@ -175,8 +139,3 @@ uint64_t arch__user_reg_mask(void) } return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c index 6b1665f41180..2cf7a54106e0 100644 --- a/tools/perf/arch/csky/util/perf_regs.c +++ b/tools/perf/arch/csky/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c index f94a0210c7b7..03a5bc0cf64c 100644 --- a/tools/perf/arch/loongarch/util/perf_regs.c +++ b/tools/perf/arch/loongarch/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c index 6b1665f41180..2cf7a54106e0 100644 --- a/tools/perf/arch/mips/util/perf_regs.c +++ b/tools/perf/arch/mips/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index bd36cfd420a2..779073f7e992 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -18,69 +18,6 @@ #define PVR_POWER10 0x0080 #define PVR_POWER11 0x0082 -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG(r0, PERF_REG_POWERPC_R0), - SMPL_REG(r1, PERF_REG_POWERPC_R1), - SMPL_REG(r2, PERF_REG_POWERPC_R2), - SMPL_REG(r3, PERF_REG_POWERPC_R3), - SMPL_REG(r4, PERF_REG_POWERPC_R4), - SMPL_REG(r5, PERF_REG_POWERPC_R5), - SMPL_REG(r6, PERF_REG_POWERPC_R6), - SMPL_REG(r7, PERF_REG_POWERPC_R7), - SMPL_REG(r8, PERF_REG_POWERPC_R8), - SMPL_REG(r9, PERF_REG_POWERPC_R9), - SMPL_REG(r10, PERF_REG_POWERPC_R10), - SMPL_REG(r11, PERF_REG_POWERPC_R11), - SMPL_REG(r12, PERF_REG_POWERPC_R12), - SMPL_REG(r13, PERF_REG_POWERPC_R13), - SMPL_REG(r14, PERF_REG_POWERPC_R14), - SMPL_REG(r15, PERF_REG_POWERPC_R15), - SMPL_REG(r16, PERF_REG_POWERPC_R16), - SMPL_REG(r17, PERF_REG_POWERPC_R17), - SMPL_REG(r18, PERF_REG_POWERPC_R18), - SMPL_REG(r19, PERF_REG_POWERPC_R19), - SMPL_REG(r20, PERF_REG_POWERPC_R20), - SMPL_REG(r21, PERF_REG_POWERPC_R21), - SMPL_REG(r22, PERF_REG_POWERPC_R22), - SMPL_REG(r23, PERF_REG_POWERPC_R23), - SMPL_REG(r24, PERF_REG_POWERPC_R24), - SMPL_REG(r25, PERF_REG_POWERPC_R25), - SMPL_REG(r26, PERF_REG_POWERPC_R26), - SMPL_REG(r27, PERF_REG_POWERPC_R27), - SMPL_REG(r28, PERF_REG_POWERPC_R28), - SMPL_REG(r29, PERF_REG_POWERPC_R29), - SMPL_REG(r30, PERF_REG_POWERPC_R30), - SMPL_REG(r31, PERF_REG_POWERPC_R31), - SMPL_REG(nip, PERF_REG_POWERPC_NIP), - SMPL_REG(msr, PERF_REG_POWERPC_MSR), - SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3), - SMPL_REG(ctr, PERF_REG_POWERPC_CTR), - SMPL_REG(link, PERF_REG_POWERPC_LINK), - SMPL_REG(xer, PERF_REG_POWERPC_XER), - SMPL_REG(ccr, PERF_REG_POWERPC_CCR), - SMPL_REG(softe, PERF_REG_POWERPC_SOFTE), - SMPL_REG(trap, PERF_REG_POWERPC_TRAP), - SMPL_REG(dar, PERF_REG_POWERPC_DAR), - SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR), - SMPL_REG(sier, PERF_REG_POWERPC_SIER), - SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA), - SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0), - SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1), - SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2), - SMPL_REG(mmcr3, PERF_REG_POWERPC_MMCR3), - SMPL_REG(sier2, PERF_REG_POWERPC_SIER2), - SMPL_REG(sier3, PERF_REG_POWERPC_SIER3), - SMPL_REG(pmc1, PERF_REG_POWERPC_PMC1), - SMPL_REG(pmc2, PERF_REG_POWERPC_PMC2), - SMPL_REG(pmc3, PERF_REG_POWERPC_PMC3), - SMPL_REG(pmc4, PERF_REG_POWERPC_PMC4), - SMPL_REG(pmc5, PERF_REG_POWERPC_PMC5), - SMPL_REG(pmc6, PERF_REG_POWERPC_PMC6), - SMPL_REG(sdar, PERF_REG_POWERPC_SDAR), - SMPL_REG(siar, PERF_REG_POWERPC_SIAR), - SMPL_REG_END -}; - /* REG or %rREG */ #define SDT_OP_REGEX1 "^(%r)?([1-2]?[0-9]|3[0-1])$" @@ -233,8 +170,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c index 6b1665f41180..2cf7a54106e0 100644 --- a/tools/perf/arch/riscv/util/perf_regs.c +++ b/tools/perf/arch/riscv/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c index 6b1665f41180..2cf7a54106e0 100644 --- a/tools/perf/arch/s390/util/perf_regs.c +++ b/tools/perf/arch/s390/util/perf_regs.c @@ -2,10 +2,6 @@ #include "perf_regs.h" #include "../../util/perf_regs.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - uint64_t arch__intr_reg_mask(void) { return PERF_REGS_MASK; @@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void) { return PERF_REGS_MASK; } - -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 12fd93f04802..a7ca4154fdf9 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -13,48 +13,6 @@ #include "../../../util/pmu.h" #include "../../../util/pmus.h" -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG(AX, PERF_REG_X86_AX), - SMPL_REG(BX, PERF_REG_X86_BX), - SMPL_REG(CX, PERF_REG_X86_CX), - SMPL_REG(DX, PERF_REG_X86_DX), - SMPL_REG(SI, PERF_REG_X86_SI), - SMPL_REG(DI, PERF_REG_X86_DI), - SMPL_REG(BP, PERF_REG_X86_BP), - SMPL_REG(SP, PERF_REG_X86_SP), - SMPL_REG(IP, PERF_REG_X86_IP), - SMPL_REG(FLAGS, PERF_REG_X86_FLAGS), - SMPL_REG(CS, PERF_REG_X86_CS), - SMPL_REG(SS, PERF_REG_X86_SS), -#ifdef HAVE_ARCH_X86_64_SUPPORT - SMPL_REG(R8, PERF_REG_X86_R8), - SMPL_REG(R9, PERF_REG_X86_R9), - SMPL_REG(R10, PERF_REG_X86_R10), - SMPL_REG(R11, PERF_REG_X86_R11), - SMPL_REG(R12, PERF_REG_X86_R12), - SMPL_REG(R13, PERF_REG_X86_R13), - SMPL_REG(R14, PERF_REG_X86_R14), - SMPL_REG(R15, PERF_REG_X86_R15), -#endif - SMPL_REG2(XMM0, PERF_REG_X86_XMM0), - SMPL_REG2(XMM1, PERF_REG_X86_XMM1), - SMPL_REG2(XMM2, PERF_REG_X86_XMM2), - SMPL_REG2(XMM3, PERF_REG_X86_XMM3), - SMPL_REG2(XMM4, PERF_REG_X86_XMM4), - SMPL_REG2(XMM5, PERF_REG_X86_XMM5), - SMPL_REG2(XMM6, PERF_REG_X86_XMM6), - SMPL_REG2(XMM7, PERF_REG_X86_XMM7), - SMPL_REG2(XMM8, PERF_REG_X86_XMM8), - SMPL_REG2(XMM9, PERF_REG_X86_XMM9), - SMPL_REG2(XMM10, PERF_REG_X86_XMM10), - SMPL_REG2(XMM11, PERF_REG_X86_XMM11), - SMPL_REG2(XMM12, PERF_REG_X86_XMM12), - SMPL_REG2(XMM13, PERF_REG_X86_XMM13), - SMPL_REG2(XMM14, PERF_REG_X86_XMM14), - SMPL_REG2(XMM15, PERF_REG_X86_XMM15), - SMPL_REG_END -}; - struct sdt_name_reg { const char *sdt_name; const char *uprobe_name; @@ -276,11 +234,6 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op) return SDT_ARG_VALID; } -const struct sample_reg *arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} - uint64_t arch__intr_reg_mask(void) { struct perf_event_attr attr = { diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.c b/tools/perf/util/arm64-frame-pointer-unwind-support.c index 958afe8b821e..858ce2b01812 100644 --- a/tools/perf/util/arm64-frame-pointer-unwind-support.c +++ b/tools/perf/util/arm64-frame-pointer-unwind-support.c @@ -2,7 +2,6 @@ #include "arm64-frame-pointer-unwind-support.h" #include "callchain.h" #include "event.h" -#include "perf_regs.h" // SMPL_REG_MASK #include "unwind.h" #include @@ -15,6 +14,8 @@ struct entries { size_t length; }; +#define SMPL_REG_MASK(b) (1ULL << (b)) + static bool get_leaf_frame_caller_enabled(struct perf_sample *sample) { struct regs_dump *regs; diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c index cda1c620968e..c0d0ef9fd495 100644 --- a/tools/perf/util/parse-regs-options.c +++ b/tools/perf/util/parse-regs-options.c @@ -5,15 +5,54 @@ #include #include #include "util/debug.h" +#include #include #include "util/perf_regs.h" #include "util/parse-regs-options.h" +static void list_perf_regs(FILE *fp, uint64_t mask) +{ + const char *last_name = NULL; + + fprintf(fp, "available registers: "); + for (int reg = 0; reg < 64; reg++) { + const char *name; + + if (((1ULL << reg) & mask) == 0) + continue; + + name = perf_reg_name(reg, EM_HOST); + if (name && (!last_name || strcmp(last_name, name))) + fprintf(fp, "%s%s", reg > 0 ? " " : "", name); + last_name = name; + } + fputc('\n', fp); +} + +static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask) +{ + uint64_t reg_mask = 0; + + for (int reg = 0; reg < 64; reg++) { + const char *name; + + if (((1ULL << reg) & mask) == 0) + continue; + + name = perf_reg_name(reg, EM_HOST); + if (!name) + continue; + + if (!strcasecmp(to_match, name)) + reg_mask |= 1ULL << reg; + } + return reg_mask; +} + static int __parse_regs(const struct option *opt, const char *str, int unset, bool intr) { uint64_t *mode = (uint64_t *)opt->value; - const struct sample_reg *r = NULL; char *s, *os = NULL, *p; int ret = -1; uint64_t mask; @@ -27,50 +66,41 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr) if (*mode) return -1; - if (intr) - mask = arch__intr_reg_mask(); - else - mask = arch__user_reg_mask(); - /* str may be NULL in case no arg is passed to -I */ - if (str) { - /* because str is read-only */ - s = os = strdup(str); - if (!s) - return -1; - - for (;;) { - p = strchr(s, ','); - if (p) - *p = '\0'; - - if (!strcmp(s, "?")) { - fprintf(stderr, "available registers: "); - for (r = arch__sample_reg_masks(); r->name; r++) { - if (r->mask & mask) - fprintf(stderr, "%s ", r->name); - } - fputc('\n', stderr); - /* just printing available regs */ - goto error; - } - for (r = arch__sample_reg_masks(); r->name; r++) { - if ((r->mask & mask) && !strcasecmp(s, r->name)) - break; - } - if (!r || !r->name) { - ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n", - s, intr ? "-I" : "--user-regs="); - goto error; - } - - *mode |= r->mask; - - if (!p) - break; - - s = p + 1; + if (!str) + return -1; + + mask = intr ? arch__intr_reg_mask() : arch__user_reg_mask(); + + /* because str is read-only */ + s = os = strdup(str); + if (!s) + return -1; + + for (;;) { + uint64_t reg_mask; + + p = strchr(s, ','); + if (p) + *p = '\0'; + + if (!strcmp(s, "?")) { + list_perf_regs(stderr, mask); + goto error; + } + + reg_mask = name_to_perf_reg_mask(s, mask); + if (reg_mask == 0) { + ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n", + s, intr ? "-I" : "--user-regs="); + goto error; } + *mode |= reg_mask; + + if (!p) + break; + + s = p + 1; } ret = 0; diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index f9723091e673..cd5acee3dc62 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -23,15 +23,6 @@ uint64_t __weak arch__user_reg_mask(void) return 0; } -static const struct sample_reg sample_reg_masks[] = { - SMPL_REG_END -}; - -const struct sample_reg * __weak arch__sample_reg_masks(void) -{ - return sample_reg_masks; -} - const char *perf_reg_name(int id, uint16_t e_machine) { const char *reg_name = NULL; diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h index 7bfc6a34c02b..2c2a8de6912d 100644 --- a/tools/perf/util/perf_regs.h +++ b/tools/perf/util/perf_regs.h @@ -7,17 +7,6 @@ struct regs_dump; -struct sample_reg { - const char *name; - uint64_t mask; -}; - -#define SMPL_REG_MASK(b) (1ULL << (b)) -#define SMPL_REG(n, b) { .name = #n, .mask = SMPL_REG_MASK(b) } -#define SMPL_REG2_MASK(b) (3ULL << (b)) -#define SMPL_REG2(n, b) { .name = #n, .mask = SMPL_REG2_MASK(b) } -#define SMPL_REG_END { .name = NULL } - enum { SDT_ARG_VALID = 0, SDT_ARG_SKIP, @@ -26,7 +15,6 @@ enum { int arch_sdt_arg_parse_op(char *old_op, char **new_op); uint64_t arch__intr_reg_mask(void); uint64_t arch__user_reg_mask(void); -const struct sample_reg *arch__sample_reg_masks(void); const char *perf_reg_name(int id, uint16_t e_machine); int perf_reg_value(u64 *valp, struct regs_dump *regs, int id); -- cgit v1.2.3 From d8df878140506e7938928195f540c10b1089fdaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:22 -0800 Subject: selftests/bpf: Fix task_local_data failure with 64K page On arm64 systems with 64K pages, the selftest task_local_data has the following failures: ... test_task_local_data_basic:PASS:tld_create_key 0 nsec test_task_local_data_basic:FAIL:tld_create_key unexpected tld_create_key: actual 0 != expected -28 ... test_task_local_data_basic_thread:PASS:run task_main 0 nsec test_task_local_data_basic_thread:FAIL:task_main retval unexpected error: 2 (errno 0) test_task_local_data_basic_thread:FAIL:tld_get_data value0 unexpected tld_get_data value0: actual 0 != expected 6268 ... #447/1 task_local_data/task_local_data_basic:FAIL ... #447/2 task_local_data/task_local_data_race:FAIL #447 task_local_data:FAIL When TLD_DYN_DATA_SIZE is 64K page size, for struct tld_meta_u { _Atomic __u8 cnt; __u16 size; struct tld_metadata metadata[]; }; field 'cnt' would overflow. For example, for 4K page, 'cnt' will be 4096/64 = 64. But for 64K page, 'cnt' will be 65536/64 = 1024 and 'cnt' is not enough for 1024. To accommodate 64K page, '_Atomic __u8 cnt' becomes '_Atomic __u16 cnt'. A few other places are adjusted accordingly. In test_task_local_data.c, the value for TLD_DYN_DATA_SIZE is changed from 4096 to (getpagesize() - 8) since the maximum buffer size for TLD_DYN_DATA_SIZE is (getpagesize() - 8). Reviewed-by: Alan Maguire Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055122.494352-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 4 ++-- tools/testing/selftests/bpf/prog_tests/test_task_local_data.c | 2 +- tools/testing/selftests/bpf/progs/task_local_data.bpf.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 2de38776a2d4..0f86b9275cf9 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -94,7 +94,7 @@ struct tld_metadata { }; struct tld_meta_u { - _Atomic __u8 cnt; + _Atomic __u16 cnt; __u16 size; struct tld_metadata metadata[]; }; @@ -217,7 +217,7 @@ out: static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) { int err, i, sz, off = 0; - __u8 cnt; + __u16 cnt; if (!TLD_READ_ONCE(tld_meta_p)) { err = __tld_init_meta_p(); diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 9fd6306b455c..9556ad3d986f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -4,7 +4,7 @@ #include #define TLD_FREE_DATA_ON_THREAD_EXIT -#define TLD_DYN_DATA_SIZE 4096 +#define TLD_DYN_DATA_SIZE (getpagesize() - 8) #include "task_local_data.h" struct test_tld_struct { diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h index 432fff2af844..fed53d63a7e5 100644 --- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -80,7 +80,7 @@ struct tld_metadata { }; struct tld_meta_u { - __u8 cnt; + __u16 cnt; __u16 size; struct tld_metadata metadata[TLD_MAX_DATA_CNT]; }; -- cgit v1.2.3 From c7900f225a102219f5fe2c1c93a7dec5467315ee Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:28 -0800 Subject: selftests/bpf: Fix xdp_pull_data failure with 64K page If the argument 'pull_len' of run_test() is 'PULL_MAX' or 'PULL_MAX | PULL_PLUS_ONE', the eventual pull_len size will close to the page size. On arm64 systems with 64K pages, the pull_len size will be close to 64K. But the existing buffer will be close to 9000 which is not enough to pull. For those failed run_tests(), make buff size to pg_sz + (pg_sz / 2) This way, there will be enough buffer space to pull regardless of page size. Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055128.495265-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c index efa350d04ec5..910dabe95afd 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -114,12 +114,14 @@ static void test_xdp_pull_data_basic(void) { u32 pg_sz, max_meta_len, max_data_len; struct test_xdp_pull_data *skel; + int buff_len; skel = test_xdp_pull_data__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) return; pg_sz = sysconf(_SC_PAGE_SIZE); + buff_len = pg_sz + pg_sz / 2; if (find_xdp_sizes(skel, pg_sz)) goto out; @@ -140,13 +142,13 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); /* multi-buf pkt, empty linear data area, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, 0, PULL_MAX); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX); /* multi-buf pkt, no tailroom, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, max_data_len, PULL_MAX); /* Test cases with invalid pull length */ @@ -154,18 +156,18 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); /* multi-buf pkt with no space left in linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, max_data_len, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, empty linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, 0, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no tailroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, max_data_len, PULL_MAX | PULL_PLUS_ONE); out: -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From 8fe4dc4f6456b3d2c9e6f8aeb1f978b7bff0f6c8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:58 +0800 Subject: bpf: change prototype of bpf_session_{cookie,is_return} Add the function argument of "void *ctx" to bpf_session_cookie() and bpf_session_is_return(), which is a preparation of the next patch. The two kfunc is seldom used now, so it will not introduce much effect to change their function prototype. Signed-off-by: Menglong Dong Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260124062008.8657-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- kernel/trace/bpf_trace.c | 4 ++-- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 --- .../selftests/bpf/progs/kprobe_multi_session_cookie.c | 15 +++++++-------- tools/testing/selftests/bpf/progs/uprobe_multi_session.c | 7 +++---- .../selftests/bpf/progs/uprobe_multi_session_cookie.c | 15 +++++++-------- .../selftests/bpf/progs/uprobe_multi_session_recursive.c | 11 +++++------ 7 files changed, 29 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2081343a848d..0fa73d56cb8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12484,6 +12484,7 @@ enum special_kfunc_type { KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, }; BTF_ID_LIST(special_kfunc_list) @@ -12561,6 +12562,7 @@ BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) +BTF_ID(func, bpf_session_is_return) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12615,7 +12617,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; if (argno + 1 < nargs && diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d466a1503da3..13f0a2de33b7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3323,7 +3323,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3331,7 +3331,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index e0189254bb6e..7dad01439391 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; -extern bool bpf_session_is_return(void) __ksym __weak; -extern __u64 *bpf_session_cookie(void) __ksym __weak; - struct dentry; /* Description * Returns xattr of a dentry diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c index 0835b5edf685..ad627016e3e5 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -23,16 +22,16 @@ int BPF_PROG(trigger) return 0; } -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_kprobe_1_result); + return check_cookie(ctx, 1, &test_kprobe_1_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_kprobe_2_result); + return check_cookie(ctx, 2, &test_kprobe_2_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_kprobe_3_result); + return check_cookie(ctx, 3, &test_kprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c index 30bff90b68dc..6e46bb00ff58 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*") int uprobe(struct pt_regs *ctx) { - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } static __always_inline bool verify_sleepable_user_copy(void) @@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_multi_sleep_result++; - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c index 5befdf944dc6..b5db196614a9 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0; __u64 test_uprobe_2_result = 0; __u64 test_uprobe_3_result = 0; -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1") int uprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_uprobe_1_result); + return check_cookie(ctx, 1, &test_uprobe_1_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2") int uprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_uprobe_2_result); + return check_cookie(ctx, 2, &test_uprobe_2_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3") int uprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_uprobe_3_result); + return check_cookie(ctx, 3, &test_uprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c index 8fbcd69fae22..3ce309248a04 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -16,11 +15,11 @@ int idx_return = 0; __u64 test_uprobe_cookie_entry[6]; __u64 test_uprobe_cookie_return[3]; -static int check_cookie(void) +static int check_cookie(struct pt_regs *ctx) { - __u64 *cookie = bpf_session_cookie(); + __u64 *cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) { + if (bpf_session_is_return(ctx)) { if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return)) return 1; test_uprobe_cookie_return[idx_return++] = *cookie; @@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx) if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - return check_cookie(); + return check_cookie(ctx); } -- cgit v1.2.3 From 257c43688b143fd9805cdfef9d2623dde92989e6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:03 +0800 Subject: libbpf: add fsession support Add BPF_TRACE_FSESSION to libbpf. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-9-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/libbpf.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 21b57a629916..5846de364209 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -794,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: case BPF_LSM_MAC: attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); if (!OPTS_ZEROED(opts, tracing)) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index bbcfd72b07d5..0c8bf0b5cce4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_FENTRY] = "trace_fentry", [BPF_TRACE_FEXIT] = "trace_fexit", [BPF_MODIFY_RETURN] = "modify_return", + [BPF_TRACE_FSESSION] = "trace_fsession", [BPF_LSM_MAC] = "lsm_mac", [BPF_LSM_CGROUP] = "lsm_cgroup", [BPF_SK_LOOKUP] = "sk_lookup", @@ -9859,6 +9860,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fsession+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fsession.s+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), -- cgit v1.2.3 From 85fc4be6d811372f8f9a2a131a092735418fdbf2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:04 +0800 Subject: bpftool: add fsession support Add BPF_TRACE_FSESSION to bpftool. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-10-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index e8daf963ecef..8bfcff9e2f63 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -1191,6 +1191,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t) case BPF_TRACE_FENTRY: return "fentry"; case BPF_TRACE_FEXIT: return "fexit"; case BPF_MODIFY_RETURN: return "mod_ret"; + case BPF_TRACE_FSESSION: return "fsession"; case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select"; case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate"; default: return libbpf_bpf_attach_type_str(t); -- cgit v1.2.3 From f7afef5617b685c3491db3593ca09abc33815774 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:05 +0800 Subject: selftests/bpf: add testcases for fsession Add testcases for BPF_TRACE_FSESSION. The function arguments and return value are tested both in the entry and exit. And the kfunc bpf_session_is_ret() is also tested. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-11-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 90 ++++++++++++++++++++ tools/testing/selftests/bpf/progs/fsession_test.c | 97 ++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fsession_test.c create mode 100644 tools/testing/selftests/bpf/progs/fsession_test.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c new file mode 100644 index 000000000000..75bb42942b67 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include "fsession_test.skel.h" + +static int check_result(struct fsession_test *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* Trigger test function calls */ + prog_fd = bpf_program__fd(skel->progs.test1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return err; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return topts.retval; + + for (int i = 0; i < sizeof(*skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(((__u64 *)skel->bss)[i], 1, "test_result")) + return -EINVAL; + } + + return 0; +} + +static void test_fsession_basic(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + +static void test_fsession_reattach(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + /* first attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_first_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + + /* detach */ + fsession_test__detach(skel); + + /* reset counters */ + memset(skel->bss, 0, sizeof(*skel->bss)); + + /* second attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_second_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + +cleanup: + fsession_test__destroy(skel); +} + +void test_fsession_test(void) +{ +#if !defined(__x86_64__) + test__skip(); + return; +#endif + if (test__start_subtest("fsession_test")) + test_fsession_basic(); + if (test__start_subtest("fsession_reattach")) + test_fsession_reattach(); +} diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c new file mode 100644 index 000000000000..0e1b66b2dddc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 test1_entry_result = 0; +__u64 test1_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test1, int a, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test1_entry_result = a == 1 && ret == 0; + return 0; + } + + test1_exit_result = a == 1 && ret == 2; + return 0; +} + +__u64 test2_entry_result = 0; +__u64 test2_exit_result = 0; + +SEC("fsession/bpf_fentry_test3") +int BPF_PROG(test2, char a, int b, __u64 c, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test2_entry_result = a == 4 && b == 5 && c == 6 && ret == 0; + return 0; + } + + test2_exit_result = a == 4 && b == 5 && c == 6 && ret == 15; + return 0; +} + +__u64 test3_entry_result = 0; +__u64 test3_exit_result = 0; + +SEC("fsession/bpf_fentry_test4") +int BPF_PROG(test3, void *a, char b, int c, __u64 d, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test3_entry_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 0; + return 0; + } + + test3_exit_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34; + return 0; +} + +__u64 test4_entry_result = 0; +__u64 test4_exit_result = 0; + +SEC("fsession/bpf_fentry_test5") +int BPF_PROG(test4, __u64 a, void *b, short c, int d, __u64 e, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test4_entry_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 0; + return 0; + } + + test4_exit_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 65; + return 0; +} + +__u64 test5_entry_result = 0; +__u64 test5_exit_result = 0; + +SEC("fsession/bpf_fentry_test7") +int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + if (!arg) + test5_entry_result = ret == 0; + return 0; + } + + if (!arg) + test5_exit_result = 1; + return 0; +} + -- cgit v1.2.3 From a5533a6eaa5b602fe54e53d85f787e09eab4e771 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:06 +0800 Subject: selftests/bpf: test bpf_get_func_* for fsession Test following bpf helper for fsession: bpf_get_func_arg() bpf_get_func_arg_cnt() bpf_get_func_ret() bpf_get_func_ip() Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-12-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/get_func_args_test.c | 1 + .../selftests/bpf/prog_tests/get_func_ip_test.c | 2 ++ .../selftests/bpf/progs/get_func_args_test.c | 40 +++++++++++++++++++++- .../testing/selftests/bpf/progs/get_func_ip_test.c | 23 +++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index fadee95d3ae8..96b27de05524 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -41,6 +41,7 @@ void test_get_func_args_test(void) ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index c40242dfa8fb..7772a0f288d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -46,6 +46,8 @@ static void test_function_entry(void) ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); ASSERT_EQ(skel->bss->test8_result, 1, "test8_result"); + ASSERT_EQ(skel->bss->test9_entry_result, 1, "test9_entry_result"); + ASSERT_EQ(skel->bss->test9_exit_result, 1, "test9_exit_result"); cleanup: get_func_ip_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 5b7233afef05..0a3236a7a109 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include @@ -165,3 +165,41 @@ int BPF_PROG(tp_test2) return 0; } + +__u64 test7_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0, ret = 0; + __s64 err; + + test7_result = cnt == 1; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test7_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test7_result &= err == -EINVAL; + + if (bpf_session_is_return(ctx)) { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 2; + } else { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 0; + } + + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test7) +{ + test7_result = 1; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 2011cacdeb18..65f7e1f182bf 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -103,3 +103,26 @@ int BPF_URETPROBE(test8, int ret) test8_result = (const void *) addr == (const void *) uprobe_trigger; return 0; } + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test9_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test9_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + test9_entry_result = test9_exit_result = 1; + return 0; +} +#endif -- cgit v1.2.3 From 8909b3fb23e245f8ade903dfcfcc43522cf28a56 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:07 +0800 Subject: selftests/bpf: add testcases for fsession cookie Test session cookie for fsession. Multiple fsession BPF progs is attached to bpf_fentry_test1() and session cookie is read and write in the testcase. bpf_get_func_ip() will influence the layout of the session cookies, so we test the cookie in two case: with and without bpf_get_func_ip(). Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-13-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 34 +++++++++++ tools/testing/selftests/bpf/progs/fsession_test.c | 66 ++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 75bb42942b67..0c4b428e1cee 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -77,6 +77,38 @@ cleanup: fsession_test__destroy(skel); } +static void test_fsession_cookie(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + goto cleanup; + + /* + * The test_fsession_basic() will test the session cookie with + * bpf_get_func_ip() case, so we need only check + * the cookie without bpf_get_func_ip() case here + */ + bpf_program__set_autoload(skel->progs.test6, false); + + err = fsession_test__load(skel); + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + skel->bss->test6_entry_result = 1; + skel->bss->test6_exit_result = 1; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + void test_fsession_test(void) { #if !defined(__x86_64__) @@ -87,4 +119,6 @@ void test_fsession_test(void) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) test_fsession_reattach(); + if (test__start_subtest("fsession_cookie")) + test_fsession_cookie(); } diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 0e1b66b2dddc..211332bdcccb 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -95,3 +95,69 @@ int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) return 0; } +__u64 test6_entry_result = 0; +__u64 test6_exit_result = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test6, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test6_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test6_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test7_entry_ok = 0; +__u64 test7_exit_ok = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0xAAAABBBBCCCCDDDDull; + test7_entry_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; + } + + test7_exit_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; +} + +__u64 test8_entry_ok = 0; +__u64 test8_exit_ok = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test8, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0x1111222233334444ull; + test8_entry_ok = *cookie == 0x1111222233334444ull; + return 0; + } + + test8_exit_ok = *cookie == 0x1111222233334444ull; + return 0; +} + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a, int ret) +{ + __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + test9_entry_result = a == 1 && ret == 0; + *cookie = 0x123456ULL; + return 0; + } + + test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; + return 0; +} -- cgit v1.2.3 From cb4bfacfb0110aa1b10ab60c64a3df0e176998c5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:08 +0800 Subject: selftests/bpf: test fsession mixed with fentry and fexit Test the fsession when it is used together with fentry, fexit. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-14-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/fsession_test.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 211332bdcccb..86e8a2fe467e 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -161,3 +161,19 @@ int BPF_PROG(test9, int a, int ret) test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; return 0; } + +__u64 test10_result = 0; +SEC("fexit/bpf_fentry_test1") +int BPF_PROG(test10, int a, int ret) +{ + test10_result = a == 1 && ret == 2; + return 0; +} + +__u64 test11_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test11, int a) +{ + test11_result = a == 1; + return 0; +} -- cgit v1.2.3 From c31df36bd26a5ed8898bb3fcc8c37ea9157ba784 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:12 +0900 Subject: selftests/bpf: Introduce execution context detection helpers Introduce bpf_in_nmi(), bpf_in_hardirq(), bpf_in_serving_softirq(), and bpf_in_task() inline helpers in bpf_experimental.h. These allow BPF programs to query the current execution context with higher granularity than the existing bpf_in_interrupt() helper. While BPF programs can often infer their context from attachment points, subsystems like sched_ext may call the same BPF logic from multiple contexts (e.g., task-to-task wake-ups vs. interrupt-to-task wake-ups). These helpers provide a reliable way for logic to branch based on the current CPU execution state. Implementing these as BPF-native inline helpers wrapping get_preempt_count() allows the compiler and JIT to inline the logic. The implementation accounts for differences in preempt_count layout between standard and PREEMPT_RT kernels. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-2-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 68a49b1f77ae..a39576c8ba04 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -610,6 +610,8 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) + extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 extern const int __preempt_count __ksym; @@ -648,4 +650,60 @@ static inline int bpf_in_interrupt(void) (tsk->softirq_disable_cnt & SOFTIRQ_MASK); } +/* Description + * Report whether it is in NMI context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_nmi(void) +{ + return get_preempt_count() & NMI_MASK; +} + +/* Description + * Report whether it is in hard IRQ context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_hardirq(void) +{ + return get_preempt_count() & HARDIRQ_MASK; +} + +/* Description + * Report whether it is in softirq context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_serving_softirq(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return (pcnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; + + tsk = (void *) bpf_get_current_task_btf(); + return (tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; +} + +/* Description + * Report whether it is in task context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_task(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return !(pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + tsk = (void *) bpf_get_current_task_btf(); + return !((pcnt & (NMI_MASK | HARDIRQ_MASK)) | + ((tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET)); +} #endif -- cgit v1.2.3 From 221b5e76c1c6e8ad4fa7c95a689e44ff45daab1c Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:13 +0900 Subject: selftests/bpf: Add tests for execution context helpers Add a new selftest suite `exe_ctx` to verify the accuracy of the bpf_in_task(), bpf_in_hardirq(), and bpf_in_serving_softirq() helpers introduced in bpf_experimental.h. Testing these execution contexts deterministically requires crossing context boundaries within a single CPU. To achieve this, the test implements a "Trigger-Observer" pattern using bpf_testmod: 1. Trigger: A BPF syscall program calls a new bpf_testmod kfunc bpf_kfunc_trigger_ctx_check(). 2. Task to HardIRQ: The kfunc uses irq_work_queue() to trigger a self-IPI on the local CPU. 3. HardIRQ to SoftIRQ: The irq_work handler calls a dummy function (observed by BPF fentry) and then schedules a tasklet to transition into SoftIRQ context. The user-space runner ensures determinism by pinning itself to CPU 0 before execution, forcing the entire interrupt chain to remain on a single core. Dummy noinline functions with compiler barriers are added to bpf_testmod.c to serve as stable attachment points for fentry programs. A retry loop is used in user-space to wait for the asynchronous SoftIRQ to complete. Note that testing on s390x is avoided because supporting those helpers purely in BPF on s390x is not possible at this point. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-3-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/prog_tests/exe_ctx.c | 59 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_ctx.c | 48 ++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 32 ++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 4 ++ 5 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/exe_ctx.c create mode 100644 tools/testing/selftests/bpf/progs/test_ctx.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a17baf8c6fd7..f7e1e5f5511c 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -1,4 +1,5 @@ # TEMPORARY # Alphabetical order +exe_ctx # execution context check (e.g., hardirq, softirq, etc) get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) diff --git a/tools/testing/selftests/bpf/prog_tests/exe_ctx.c b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c new file mode 100644 index 000000000000..aed6a6ef0876 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include +#include +#include "test_ctx.skel.h" + +void test_exe_ctx(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + cpu_set_t old_cpuset, target_cpuset; + struct test_ctx *skel; + int err, prog_fd; + + /* 1. Pin the current process to CPU 0. */ + if (sched_getaffinity(0, sizeof(old_cpuset), &old_cpuset) == 0) { + CPU_ZERO(&target_cpuset); + CPU_SET(0, &target_cpuset); + ASSERT_OK(sched_setaffinity(0, sizeof(target_cpuset), + &target_cpuset), "setaffinity"); + } + + skel = test_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto restore_affinity; + + err = test_ctx__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* 2. When we run this, the kernel will execute the BPF prog on CPU 0. */ + prog_fd = bpf_program__fd(skel->progs.trigger_all_contexts); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "test_run_trigger"); + + /* 3. Wait for the local CPU's softirq/tasklet to finish. */ + for (int i = 0; i < 1000; i++) { + if (skel->bss->count_task > 0 && + skel->bss->count_hardirq > 0 && + skel->bss->count_softirq > 0) + break; + usleep(1000); /* Wait 1ms per iteration, up to 1 sec total */ + } + + /* On CPU 0, these should now all be non-zero. */ + ASSERT_GT(skel->bss->count_task, 0, "task_ok"); + ASSERT_GT(skel->bss->count_hardirq, 0, "hardirq_ok"); + ASSERT_GT(skel->bss->count_softirq, 0, "softirq_ok"); + +cleanup: + test_ctx__destroy(skel); + +restore_affinity: + ASSERT_OK(sched_setaffinity(0, sizeof(old_cpuset), &old_cpuset), + "restore_affinity"); +} diff --git a/tools/testing/selftests/bpf/progs/test_ctx.c b/tools/testing/selftests/bpf/progs/test_ctx.c new file mode 100644 index 000000000000..7d4995506717 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ctx.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include "vmlinux.h" +#include +#include +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +extern void bpf_kfunc_trigger_ctx_check(void) __ksym; + +int count_hardirq; +int count_softirq; +int count_task; + +/* Triggered via bpf_prog_test_run from user-space */ +SEC("syscall") +int trigger_all_contexts(void *ctx) +{ + if (bpf_in_task()) + __sync_fetch_and_add(&count_task, 1); + + /* Trigger the firing of a hardirq and softirq for test. */ + bpf_kfunc_trigger_ctx_check(); + return 0; +} + +/* Observer for HardIRQ */ +SEC("fentry/bpf_testmod_test_hardirq_fn") +int BPF_PROG(on_hardirq) +{ + if (bpf_in_hardirq()) + __sync_fetch_and_add(&count_hardirq, 1); + return 0; +} + +/* Observer for SoftIRQ */ +SEC("fentry/bpf_testmod_test_softirq_fn") +int BPF_PROG(on_softirq) +{ + if (bpf_in_serving_softirq()) + __sync_fetch_and_add(&count_softirq, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 77a81fa8ec6a..186a25ab429a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1168,6 +1168,33 @@ __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); +/* hook targets */ +noinline void bpf_testmod_test_hardirq_fn(void) { barrier(); } +noinline void bpf_testmod_test_softirq_fn(void) { barrier(); } + +/* Tasklet for SoftIRQ context */ +static void ctx_check_tasklet_fn(struct tasklet_struct *t) +{ + bpf_testmod_test_softirq_fn(); +} + +DECLARE_TASKLET(ctx_check_tasklet, ctx_check_tasklet_fn); + +/* IRQ Work for HardIRQ context */ +static void ctx_check_irq_fn(struct irq_work *work) +{ + bpf_testmod_test_hardirq_fn(); + tasklet_schedule(&ctx_check_tasklet); +} + +static struct irq_work ctx_check_irq = IRQ_WORK_INIT_HARD(ctx_check_irq_fn); + +/* The kfunc trigger */ +__bpf_kfunc void bpf_kfunc_trigger_ctx_check(void) +{ + irq_work_queue(&ctx_check_irq); +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1213,6 +1240,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) +BTF_ID_FLAGS(func, bpf_kfunc_trigger_ctx_check) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1844,6 +1872,10 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + /* Clean up irqwork and tasklet */ + irq_work_sync(&ctx_check_irq); + tasklet_kill(&ctx_check_tasklet); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); unregister_bpf_testmod_uprobe(); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 10f89f06245f..d5c5454e257e 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -169,4 +169,8 @@ extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; +void bpf_testmod_test_hardirq_fn(void); +void bpf_testmod_test_softirq_fn(void); +void bpf_kfunc_trigger_ctx_check(void) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 8e5bcc3a955a2cc4460b391f55d3b49905eb248e Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Sun, 25 Jan 2026 08:57:46 +0000 Subject: selftests: ublk: add missing gitignore for metadata_size binary A new utility metadata_size was added in commit 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") but it was not added to .gitignore. Fix that by adding it there. While at it sort all entries alphabetically and add a SPDX license header. Reviewed-by: Caleb Sander Mateos Fixes: 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") Signed-off-by: Alexander Atanasov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/.gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore index 8b2871ea7751..e17bd28f27e0 100644 --- a/tools/testing/selftests/ublk/.gitignore +++ b/tools/testing/selftests/ublk/.gitignore @@ -1,3 +1,5 @@ -kublk -/tools +# SPDX-License-Identifier: GPL-2.0 *-verify.state +/tools +kublk +metadata_size -- cgit v1.2.3 From 1742272bd3fae6362301d0f11eb9db9030348afc Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 21 Jan 2026 20:44:09 +0100 Subject: selftests: net: add ipv6 ping to local address from localhost Test ipv6 pinging to local configured address and linklocal address from localhost with -I ::1. Signed-off-by: Fernando Fernandez Mancera Reviewed-by: David Ahern Link: https://patch.msgid.link/20260121194409.6749-2-fmancera@suse.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fcnal-test.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 844a580ae74e..890c3f8e51bb 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -2327,6 +2327,13 @@ ipv6_ping_novrf() log_test_addr ${a} $? 2 "ping local, device bind" done + for a in ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSA_DEV} ${NSA_IP6} + do + log_start + run_cmd ${ping6} -c1 -w1 -I ::1 ${a} + log_test_addr ${a} $? 0 "ping local, from localhost" + done + # # ip rule blocks address # -- cgit v1.2.3 From 3c58f03e805f8f9025f09fe393103947dca49c57 Mon Sep 17 00:00:00 2001 From: I-Hsin Cheng Date: Sat, 24 Jan 2026 20:32:41 +0800 Subject: kselftest/arm64: Add missing file in .gitignore The binary generated by check_hugetlb_options is missing in .gitignore under the directory. Add it into the file so it won't be logged into version control. Signed-off-by: I-Hsin Cheng Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 052d0f9f92b3..f6937f890039 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -6,3 +6,4 @@ check_mmap_options check_prctl check_ksm_options check_user_mem +check_hugetlb_options -- cgit v1.2.3 From 87a6e3b6c494ac519548c30b82b0d87b233b9649 Mon Sep 17 00:00:00 2001 From: Khushal Chitturi Date: Wed, 19 Nov 2025 01:22:58 +0530 Subject: xdrgen: improve error reporting for invalid void declarations RFC 4506 defines void as a zero-length type that may appear only as union arms or as program argument/result types. It cannot be declared with an identifier, so constructs like "typedef void temp;" are not valid XDR. Previously, xdrgen raised a NotImplementedError when it encountered a void declaration in a typedef. Which was misleading, as the problem is an invalid RPC specification rather than missing functionality in xdrgen. This patch replaces the NotImplementedError for _XdrVoid in typedef handling with a clearer ValueError that specifies incorrect use of void in the XDR input, making it clear that the issue lies in the RPC specification being parsed. Signed-off-by: Khushal Chitturi Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/typedef.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/generators/typedef.py b/tools/net/sunrpc/xdrgen/generators/typedef.py index fab72e9d6915..75e3a40e14e1 100644 --- a/tools/net/sunrpc/xdrgen/generators/typedef.py +++ b/tools/net/sunrpc/xdrgen/generators/typedef.py @@ -58,7 +58,7 @@ def emit_typedef_declaration(environment: Environment, node: _XdrDeclaration) -> elif isinstance(node, _XdrOptionalData): raise NotImplementedError(" typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError(" typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -104,7 +104,7 @@ def emit_type_definition(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError(" typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError(" typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -165,7 +165,7 @@ def emit_typedef_decoder(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError(" typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError(" typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -225,7 +225,7 @@ def emit_typedef_encoder(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError(" typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError(" typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") -- cgit v1.2.3 From 9654a0388a3abcfa51cb2d010a6624a6f1f46710 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Nov 2025 15:15:52 -0500 Subject: xdrgen: Generate "if" instead of "switch" for boolean union enumerators Eliminate this warning in code generated by xdrgen: fs/nfsd/nfs3xdr_gen.c:220:2: warning: switch condition has boolean value [-Wswitch-bool] 220 | switch (ptr->attributes_follow) { | ^ ~~~~~~~~~~~~~~~~~~~~~~ No more -Wswitch-bool warnings when compiling with W=1. The generated code is functionally equivalent but somewhat more idiomatic. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511172336.Y75zj4v6-lkp@intel.com/ Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/union.py | 115 +++++++++++++++++---- .../xdrgen/templates/C/union/decoder/bool_spec.j2 | 7 ++ .../xdrgen/templates/C/union/encoder/bool_spec.j2 | 7 ++ 3 files changed, 109 insertions(+), 20 deletions(-) create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py index ad1f214ef22a..d15837dae651 100644 --- a/tools/net/sunrpc/xdrgen/generators/union.py +++ b/tools/net/sunrpc/xdrgen/generators/union.py @@ -84,6 +84,31 @@ def emit_union_switch_spec_decoder( print(template.render(name=node.name, type=node.spec.type_name)) +def emit_union_arm_decoder( + environment: Environment, node: _XdrCaseSpec +) -> None: + """Emit decoder for an XDR union's arm (data only, no case/break)""" + + if isinstance(node.arm, _XdrVoid): + return + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) + template = get_jinja2_template(environment, "decoder", node.arm.template) + print( + template.render( + name=node.arm.name, + type=type_name, + classifier=classifier, + ) + ) + + def emit_union_case_spec_decoder( environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool ) -> None: @@ -151,19 +176,33 @@ def emit_union_decoder(environment: Environment, node: _XdrUnion) -> None: template = get_jinja2_template(environment, "decoder", "open") print(template.render(name=node.name)) - emit_union_switch_spec_decoder(environment, node.discriminant) + # For boolean discriminants, use if statement instead of switch + if node.discriminant.spec.type_name == "bool": + template = get_jinja2_template(environment, "decoder", "bool_spec") + print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name)) - for case in node.cases: - emit_union_case_spec_decoder( - environment, - case, - node.discriminant.spec.type_name in big_endian, - ) + # Find and emit the TRUE case + for case in node.cases: + if case.values and case.values[0] == "TRUE": + emit_union_arm_decoder(environment, case) + break - emit_union_default_spec_decoder(environment, node) + template = get_jinja2_template(environment, "decoder", "close") + print(template.render()) + else: + emit_union_switch_spec_decoder(environment, node.discriminant) - template = get_jinja2_template(environment, "decoder", "close") - print(template.render()) + for case in node.cases: + emit_union_case_spec_decoder( + environment, + case, + node.discriminant.spec.type_name in big_endian, + ) + + emit_union_default_spec_decoder(environment, node) + + template = get_jinja2_template(environment, "decoder", "close") + print(template.render()) def emit_union_switch_spec_encoder( @@ -175,6 +214,28 @@ def emit_union_switch_spec_encoder( print(template.render(name=node.name, type=node.spec.type_name)) +def emit_union_arm_encoder( + environment: Environment, node: _XdrCaseSpec +) -> None: + """Emit encoder for an XDR union's arm (data only, no case/break)""" + + if isinstance(node.arm, _XdrVoid): + return + if isinstance(node.arm, _XdrString): + type_name = "char *" + else: + type_name = node.arm.spec.type_name + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) + template = get_jinja2_template(environment, "encoder", node.arm.template) + print( + template.render( + name=node.arm.name, + type=type_name, + ) + ) + + def emit_union_case_spec_encoder( environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool ) -> None: @@ -235,19 +296,33 @@ def emit_union_encoder(environment, node: _XdrUnion) -> None: template = get_jinja2_template(environment, "encoder", "open") print(template.render(name=node.name)) - emit_union_switch_spec_encoder(environment, node.discriminant) + # For boolean discriminants, use if statement instead of switch + if node.discriminant.spec.type_name == "bool": + template = get_jinja2_template(environment, "encoder", "bool_spec") + print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name)) - for case in node.cases: - emit_union_case_spec_encoder( - environment, - case, - node.discriminant.spec.type_name in big_endian, - ) + # Find and emit the TRUE case + for case in node.cases: + if case.values and case.values[0] == "TRUE": + emit_union_arm_encoder(environment, case) + break - emit_union_default_spec_encoder(environment, node) + template = get_jinja2_template(environment, "encoder", "close") + print(template.render()) + else: + emit_union_switch_spec_encoder(environment, node.discriminant) - template = get_jinja2_template(environment, "encoder", "close") - print(template.render()) + for case in node.cases: + emit_union_case_spec_encoder( + environment, + case, + node.discriminant.spec.type_name in big_endian, + ) + + emit_union_default_spec_encoder(environment, node) + + template = get_jinja2_template(environment, "encoder", "close") + print(template.render()) def emit_union_maxsize(environment: Environment, node: _XdrUnion) -> None: diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 new file mode 100644 index 000000000000..05ad491f74af --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 @@ -0,0 +1,7 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* discriminant {{ name }} */ +{% endif %} + if (!xdrgen_decode_{{ type }}(xdr, &ptr->{{ name }})) + return false; + if (ptr->{{ name }}) { diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 new file mode 100644 index 000000000000..e5135ed6471c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 @@ -0,0 +1,7 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* discriminant {{ name }} */ +{% endif %} + if (!xdrgen_encode_{{ type }}(xdr, ptr->{{ name }})) + return false; + if (ptr->{{ name }}) { -- cgit v1.2.3 From 4329010ad9c36775e7092e451c37c24c4f90243f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Dec 2025 17:19:46 -0500 Subject: xdrgen: Address some checkpatch whitespace complaints This is a roll-up of three template fixes that eliminate noise from checkpatch output so that it's easier to spot non-trivial problems. To follow conventional kernel C style, when a union declaration is marked with "pragma public", there should be a blank line between the emitted "union xxx { ... };" and the decoder and encoder function declarations. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 | 1 - tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 | 1 + tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 | 1 + tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 index d1405c7c5354..c7ae506076bb 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 @@ -1,4 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} - bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr); bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, {{ name }} value); diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 index a07586cbee17..446266ad6d17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 @@ -1,3 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} }; + typedef enum {{ name }} {{ name }}; diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 index 2c18948bddf7..cfeee2287e68 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 @@ -1,3 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} }; + typedef __be32 {{ name }}; diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 index 01d716d0099e..5fc1937ba774 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 @@ -3,6 +3,7 @@ }; {%- if name in public_apis %} + bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr); bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *ptr); {%- endif -%} -- cgit v1.2.3 From bf0fe9ad3d597d8e1378dc9953ca96dfc3addb2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 8 Dec 2025 11:15:32 -0500 Subject: xdrgen: Fix struct prefix for typedef types in program wrappers The program templates for decoder/argument.j2 and encoder/result.j2 unconditionally add 'struct' prefix to all types. This is incorrect when an RPC protocol specification lists a typedef'd basic type or an enum as a procedure argument or result (e.g., NFSv2's fhandle or stat), resulting in compiler errors when building generated C code. Fixes: 4b132aacb076 ("tools: Add xdrgen") Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/__init__.py | 3 ++- tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 | 4 ++++ tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py index e22632cf38fb..1d577a986c6c 100644 --- a/tools/net/sunrpc/xdrgen/generators/__init__.py +++ b/tools/net/sunrpc/xdrgen/generators/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from jinja2 import Environment, FileSystemLoader, Template from xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier -from xdr_ast import public_apis, pass_by_reference, get_header_name +from xdr_ast import public_apis, pass_by_reference, structs, get_header_name from xdr_parse import get_xdr_annotate @@ -25,6 +25,7 @@ def create_jinja2_environment(language: str, xdr_type: str) -> Environment: environment.globals["annotate"] = get_xdr_annotate() environment.globals["public_apis"] = public_apis environment.globals["pass_by_reference"] = pass_by_reference + environment.globals["structs"] = structs return environment case _: raise NotImplementedError("Language not supported") diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 index 0b1709cca0d4..19b219dd276d 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 @@ -14,7 +14,11 @@ bool {{ program }}_svc_decode_{{ argument }}(struct svc_rqst *rqstp, struct xdr_ {% if argument == 'void' %} return xdrgen_decode_void(xdr); {% else %} +{% if argument in structs %} struct {{ argument }} *argp = rqstp->rq_argp; +{% else %} + {{ argument }} *argp = rqstp->rq_argp; +{% endif %} return xdrgen_decode_{{ argument }}(xdr, argp); {% endif %} diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 index 6fc61a5d47b7..746592cfda56 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 @@ -14,8 +14,14 @@ bool {{ program }}_svc_encode_{{ result }}(struct svc_rqst *rqstp, struct xdr_st {% if result == 'void' %} return xdrgen_encode_void(xdr); {% else %} +{% if result in structs %} struct {{ result }} *resp = rqstp->rq_resp; return xdrgen_encode_{{ result }}(xdr, resp); +{% else %} + {{ result }} *resp = rqstp->rq_resp; + + return xdrgen_encode_{{ result }}(xdr, *resp); +{% endif %} {% endif %} } -- cgit v1.2.3 From 288d9ddbb74f52e07b1e2bc628768f7847dcb7e6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 10 Dec 2025 09:04:24 -0500 Subject: xdrgen: Emit the program number definition "xdrgen definitions" was not providing a definition of a symbolic constant for the RPC program number being defined. Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/program.py | 3 +++ tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 | 5 +++++ 2 files changed, 8 insertions(+) create mode 100644 tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py index ac3cf1694b68..decb092ef02c 100644 --- a/tools/net/sunrpc/xdrgen/generators/program.py +++ b/tools/net/sunrpc/xdrgen/generators/program.py @@ -127,6 +127,9 @@ class XdrProgramGenerator(SourceGenerator): for version in node.versions: emit_version_definitions(self.environment, program, version) + template = self.environment.get_template("definition/program.j2") + print(template.render(name=raw_name, value=node.number)) + def emit_declaration(self, node: _RpcProgram) -> None: """Emit a declaration pair for each of an RPC programs's procedures""" raw_name = node.name diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 new file mode 100644 index 000000000000..320663ffc37f --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 @@ -0,0 +1,5 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +#ifndef {{ name }} +#define {{ name }} ({{ value }}) +#endif -- cgit v1.2.3 From ae78eb497868f335919db83b82eb59849c6cf251 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 16 Dec 2025 11:23:09 -0500 Subject: xdrgen: Implement short (16-bit) integer types "short" and "unsigned short" types are not defined in RFC 4506, but are supported by the rpcgen program. An upcoming protocol specification includes at least one "unsigned short" field, so xdrgen needs to implement support for these types. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/_builtins.h | 60 ++++++++++++++++++++++++++ tools/net/sunrpc/xdrgen/generators/__init__.py | 2 + tools/net/sunrpc/xdrgen/grammars/xdr.lark | 4 ++ tools/net/sunrpc/xdrgen/xdr_ast.py | 4 ++ 4 files changed, 70 insertions(+) (limited to 'tools') diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h index 66ca3ece951a..52ed9a9151c4 100644 --- a/include/linux/sunrpc/xdrgen/_builtins.h +++ b/include/linux/sunrpc/xdrgen/_builtins.h @@ -46,6 +46,66 @@ xdrgen_encode_bool(struct xdr_stream *xdr, bool val) return true; } +/* + * De facto (non-standard but commonly implemented) signed short type: + * - Wire sends sign-extended 32-bit value (e.g., 0xFFFFFFFF) + * - be32_to_cpup() returns u32 (0xFFFFFFFF) + * - Explicit (s16) cast truncates to 16 bits (0xFFFF = -1) + */ +static inline bool +xdrgen_decode_short(struct xdr_stream *xdr, s16 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = (s16)be32_to_cpup(p); + return true; +} + +/* + * De facto (non-standard but commonly implemented) signed short type: + * - C integer promotion sign-extends s16 val to int before passing to + * cpu_to_be32() + * - This is well-defined: -1 as s16 -1 as int 0xFFFFFFFF on wire + */ +static inline bool +xdrgen_encode_short(struct xdr_stream *xdr, s16 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + +/* + * De facto (non-standard but commonly implemented) unsigned short type: + * 16-bit integer zero-extended to fill one XDR_UNIT. + */ +static inline bool +xdrgen_decode_unsigned_short(struct xdr_stream *xdr, u16 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = (u16)be32_to_cpup(p); + return true; +} + +static inline bool +xdrgen_encode_unsigned_short(struct xdr_stream *xdr, u16 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + static inline bool xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr) { diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py index 1d577a986c6c..5c3a4a47ded8 100644 --- a/tools/net/sunrpc/xdrgen/generators/__init__.py +++ b/tools/net/sunrpc/xdrgen/generators/__init__.py @@ -59,6 +59,8 @@ def kernel_c_type(spec: _XdrTypeSpecifier) -> str: """Return name of C type""" builtin_native_c_type = { "bool": "bool", + "short": "s16", + "unsigned_short": "u16", "int": "s32", "unsigned_int": "u32", "long": "s32", diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark index 7c2c1b8c86d1..b7c664f2acb7 100644 --- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark +++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark @@ -20,9 +20,11 @@ constant : decimal_constant | hexadecimal_constant | octal_consta type_specifier : unsigned_hyper | unsigned_long | unsigned_int + | unsigned_short | hyper | long | int + | short | float | double | quadruple @@ -35,9 +37,11 @@ type_specifier : unsigned_hyper unsigned_hyper : "unsigned" "hyper" unsigned_long : "unsigned" "long" unsigned_int : "unsigned" "int" +unsigned_short : "unsigned" "short" hyper : "hyper" long : "long" int : "int" +short : "short" float : "float" double : "double" quadruple : "quadruple" diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py index 5233e73c7046..2b5d160a0a60 100644 --- a/tools/net/sunrpc/xdrgen/xdr_ast.py +++ b/tools/net/sunrpc/xdrgen/xdr_ast.py @@ -34,6 +34,8 @@ def xdr_quadlen(val: str) -> int: symbolic_widths = { "void": ["XDR_void"], "bool": ["XDR_bool"], + "short": ["XDR_short"], + "unsigned_short": ["XDR_unsigned_short"], "int": ["XDR_int"], "unsigned_int": ["XDR_unsigned_int"], "long": ["XDR_long"], @@ -48,6 +50,8 @@ symbolic_widths = { max_widths = { "void": 0, "bool": 1, + "short": 1, + "unsigned_short": 1, "int": 1, "unsigned_int": 1, "long": 1, -- cgit v1.2.3 From eb1f3b55ac6202a013daf14ed508066947cdafa8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 22 Dec 2025 09:44:29 -0500 Subject: xdrgen: Remove inclusion of nlm4.h header The client-side source code template mistakenly includes the nlm4.h header file, which is specific to the NLM protocol and should not be present in the generic template that generates client stubs for all XDR-based protocols. Fixes: 903a7d37d9ea ("xdrgen: Update the files included in client-side source code") Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 index c5518c519854..df3598c38b2c 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 @@ -8,6 +8,5 @@ #include #include #include -#include #include -- cgit v1.2.3 From 9abb3549227e4fb70f0d8ba515bf7ddd249ad710 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 22 Dec 2025 09:44:59 -0500 Subject: xdrgen: Improve parse error reporting The current verbose Lark exception output makes it difficult to quickly identify and fix syntax errors in XDR specifications. Users must wade through hundreds of lines of cascading errors to find the root cause. Replace this with concise, compiler-style error messages showing file, line, column, the unexpected token, and the source line with a caret pointing to the error location. Before: Unexpected token Token('__ANON_1', '+1') at line 14, column 35. Expected one of: * SEMICOLON Previous tokens: [Token('__ANON_0', 'LM_MAXSTRLEN')] [hundreds more cascading errors...] After: file.x:14:35: parse error Unexpected number '+1' const LM_MAXNAMELEN = LM_MAXSTRLEN+1; ^ The error handler now raises XdrParseError on the first error, preventing cascading messages that obscure the root cause. Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/subcmds/declarations.py | 16 ++--- tools/net/sunrpc/xdrgen/subcmds/definitions.py | 16 ++--- tools/net/sunrpc/xdrgen/subcmds/lint.py | 17 +++-- tools/net/sunrpc/xdrgen/subcmds/source.py | 16 ++--- tools/net/sunrpc/xdrgen/xdr_parse.py | 86 +++++++++++++++++++++++++ tools/net/sunrpc/xdrgen/xdrgen | 2 - 6 files changed, 118 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py index c5e8d79986ef..2fd5c255a547 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py +++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py @@ -8,7 +8,6 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator @@ -24,6 +23,7 @@ from xdr_ast import transform_parse_tree, _RpcProgram, Specification from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import make_error_handler, XdrParseError logger.setLevel(logging.INFO) @@ -50,19 +50,19 @@ def emit_header_declarations( gen.emit_declaration(definition.value) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate definitions and declarations""" set_xdr_annotate(args.annotate) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 ast = transform_parse_tree(parse_tree) gen = XdrHeaderTopGenerator(args.language, args.peer) diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py index c956e27f37c0..8ea5c57cc37a 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py +++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py @@ -8,7 +8,6 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator @@ -24,6 +23,7 @@ from xdr_ast import transform_parse_tree, Specification from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import make_error_handler, XdrParseError logger.setLevel(logging.INFO) @@ -69,19 +69,19 @@ def emit_header_maxsize(root: Specification, language: str, peer: str) -> None: gen.emit_maxsize(definition.value) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate definitions""" set_xdr_annotate(args.annotate) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 ast = transform_parse_tree(parse_tree) gen = XdrHeaderTopGenerator(args.language, args.peer) diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py index 36cc43717d30..2c48fa57c4e5 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/lint.py +++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py @@ -8,26 +8,25 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput -from xdr_parse import xdr_parser +from xdr_parse import xdr_parser, make_error_handler, XdrParseError from xdr_ast import transform_parse_tree logger.setLevel(logging.DEBUG) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Lexical and syntax check of an XDR specification""" parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 transform_parse_tree(parse_tree) return 0 diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py index 2024954748f0..bc7d38802df3 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/source.py +++ b/tools/net/sunrpc/xdrgen/subcmds/source.py @@ -8,7 +8,6 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput from generators.source_top import XdrSourceTopGenerator from generators.enum import XdrEnumGenerator @@ -23,6 +22,7 @@ from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import make_error_handler, XdrParseError logger.setLevel(logging.INFO) @@ -92,19 +92,19 @@ def generate_client_source(filename: str, root: Specification, language: str) -> # cel: todo: client needs PROC macros -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate encoder and decoder functions""" set_xdr_annotate(args.annotate) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 ast = transform_parse_tree(parse_tree) match args.peer: case "server": diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py index 964b44e675df..426513be066c 100644 --- a/tools/net/sunrpc/xdrgen/xdr_parse.py +++ b/tools/net/sunrpc/xdrgen/xdr_parse.py @@ -3,12 +3,40 @@ """Common parsing code for xdrgen""" +import sys +from typing import Callable + from lark import Lark +from lark.exceptions import UnexpectedInput, UnexpectedToken # Set to True to emit annotation comments in generated source annotate = False +# Map internal Lark token names to human-readable names +TOKEN_NAMES = { + "__ANON_0": "identifier", + "__ANON_1": "number", + "SEMICOLON": "';'", + "LBRACE": "'{'", + "RBRACE": "'}'", + "LPAR": "'('", + "RPAR": "')'", + "LSQB": "'['", + "RSQB": "']'", + "LESSTHAN": "'<'", + "MORETHAN": "'>'", + "EQUAL": "'='", + "COLON": "':'", + "COMMA": "','", + "STAR": "'*'", + "$END": "end of file", +} + + +class XdrParseError(Exception): + """Raised when XDR parsing fails""" + def set_xdr_annotate(set_it: bool) -> None: """Set 'annotate' if --annotate was specified on the command line""" @@ -21,6 +49,64 @@ def get_xdr_annotate() -> bool: return annotate +def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]: + """Create an error handler that reports the first parse error and aborts. + + Args: + source: The XDR source text being parsed + filename: The name of the file being parsed + + Returns: + An error handler function for use with Lark's on_error parameter + """ + lines = source.splitlines() + + def handle_parse_error(e: UnexpectedInput) -> bool: + """Report a parse error with context and abort parsing""" + line_num = e.line + column = e.column + line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else "" + + # Build the error message + msg_parts = [f"{filename}:{line_num}:{column}: parse error"] + + # Show what was found vs what was expected + if isinstance(e, UnexpectedToken): + token = e.token + if token.type == "__ANON_0": + found = f"identifier '{token.value}'" + elif token.type == "__ANON_1": + found = f"number '{token.value}'" + else: + found = f"'{token.value}'" + msg_parts.append(f"Unexpected {found}") + + # Provide helpful expected tokens list + expected = e.expected + if expected: + readable = [ + TOKEN_NAMES.get(exp, exp.lower().replace("_", " ")) + for exp in sorted(expected) + ] + if len(readable) == 1: + msg_parts.append(f"Expected {readable[0]}") + elif len(readable) <= 4: + msg_parts.append(f"Expected one of: {', '.join(readable)}") + else: + msg_parts.append(str(e).split("\n")[0]) + + # Show the offending line with a caret pointing to the error + msg_parts.append("") + msg_parts.append(f" {line_text}") + prefix = line_text[: column - 1].expandtabs() + msg_parts.append(f" {' ' * len(prefix)}^") + + sys.stderr.write("\n".join(msg_parts) + "\n") + raise XdrParseError() + + return handle_parse_error + + def xdr_parser() -> Lark: """Return a Lark parser instance configured with the XDR language grammar""" diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen index 3afd0547d67c..e22638f8324b 100755 --- a/tools/net/sunrpc/xdrgen/xdrgen +++ b/tools/net/sunrpc/xdrgen/xdrgen @@ -133,7 +133,5 @@ There is NO WARRANTY, to the extent permitted by law.""", try: if __name__ == "__main__": sys.exit(main()) -except SystemExit: - sys.exit(0) except (KeyboardInterrupt, BrokenPipeError): sys.exit(1) -- cgit v1.2.3 From 63a5425ff5e077c54eb2719c735108e2aa1f9eb6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Dec 2025 10:19:33 -0500 Subject: xdrgen: Extend error reporting to AST transformation phase Commit 277df18d7df9 ("xdrgen: Improve parse error reporting") added clean, compiler-style error messages for syntax errors detected during parsing. However, semantic errors discovered during AST transformation still produce verbose Python stack traces. When an XDR specification references an undefined type, the transformer raises a VisitError wrapping a KeyError. Before this change: Traceback (most recent call last): File ".../lark/visitors.py", line 124, in _call_userfunc return f(children) ... KeyError: 'fsh4_mode' ... lark.exceptions.VisitError: Error trying to process rule "basic": 'fsh4_mode' After this change: file.x:156:2: semantic error Undefined type 'fsh4_mode' fsh4_mode mode; ^ The new handle_transform_error() function extracts position information from the Lark tree node metadata and formats the error consistently with parse error messages. Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/subcmds/declarations.py | 8 ++++- tools/net/sunrpc/xdrgen/subcmds/definitions.py | 8 ++++- tools/net/sunrpc/xdrgen/subcmds/lint.py | 8 ++++- tools/net/sunrpc/xdrgen/subcmds/source.py | 8 ++++- tools/net/sunrpc/xdrgen/xdr_parse.py | 40 ++++++++++++++++++++++++- 5 files changed, 67 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py index 2fd5c255a547..97ffb76a02f1 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py +++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py @@ -8,6 +8,7 @@ import logging from argparse import Namespace from lark import logger +from lark.exceptions import VisitError from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator @@ -24,6 +25,7 @@ from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -63,7 +65,11 @@ def subcmd(args: Namespace) -> int: ) except XdrParseError: return 1 - ast = transform_parse_tree(parse_tree) + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 gen = XdrHeaderTopGenerator(args.language, args.peer) gen.emit_declaration(args.filename, ast) diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py index 8ea5c57cc37a..d6c2dcd6f78f 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py +++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py @@ -8,6 +8,7 @@ import logging from argparse import Namespace from lark import logger +from lark.exceptions import VisitError from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator @@ -24,6 +25,7 @@ from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -82,7 +84,11 @@ def subcmd(args: Namespace) -> int: ) except XdrParseError: return 1 - ast = transform_parse_tree(parse_tree) + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 gen = XdrHeaderTopGenerator(args.language, args.peer) gen.emit_definition(args.filename, ast) diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py index 2c48fa57c4e5..e1da49632e62 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/lint.py +++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py @@ -8,8 +8,10 @@ import logging from argparse import Namespace from lark import logger +from lark.exceptions import VisitError from xdr_parse import xdr_parser, make_error_handler, XdrParseError +from xdr_parse import handle_transform_error from xdr_ast import transform_parse_tree logger.setLevel(logging.DEBUG) @@ -27,6 +29,10 @@ def subcmd(args: Namespace) -> int: ) except XdrParseError: return 1 - transform_parse_tree(parse_tree) + try: + transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 return 0 diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py index bc7d38802df3..08c883f547d7 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/source.py +++ b/tools/net/sunrpc/xdrgen/subcmds/source.py @@ -8,6 +8,7 @@ import logging from argparse import Namespace from lark import logger +from lark.exceptions import VisitError from generators.source_top import XdrSourceTopGenerator from generators.enum import XdrEnumGenerator @@ -23,6 +24,7 @@ from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -105,7 +107,11 @@ def subcmd(args: Namespace) -> int: ) except XdrParseError: return 1 - ast = transform_parse_tree(parse_tree) + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 match args.peer: case "server": generate_server_source(args.filename, ast, args.language) diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py index 426513be066c..38724ad5aea2 100644 --- a/tools/net/sunrpc/xdrgen/xdr_parse.py +++ b/tools/net/sunrpc/xdrgen/xdr_parse.py @@ -7,7 +7,7 @@ import sys from typing import Callable from lark import Lark -from lark.exceptions import UnexpectedInput, UnexpectedToken +from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError # Set to True to emit annotation comments in generated source @@ -107,6 +107,44 @@ def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput] return handle_parse_error +def handle_transform_error(e: VisitError, source: str, filename: str) -> None: + """Report a transform error with context. + + Args: + e: The VisitError from Lark's transformer + source: The XDR source text being parsed + filename: The name of the file being parsed + """ + lines = source.splitlines() + + # Extract position from the tree node if available + line_num = 0 + column = 0 + if hasattr(e.obj, "meta") and e.obj.meta: + line_num = e.obj.meta.line + column = e.obj.meta.column + + line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else "" + + # Build the error message + msg_parts = [f"{filename}:{line_num}:{column}: semantic error"] + + # The original exception is typically a KeyError for undefined types + if isinstance(e.orig_exc, KeyError): + msg_parts.append(f"Undefined type '{e.orig_exc.args[0]}'") + else: + msg_parts.append(str(e.orig_exc)) + + # Show the offending line with a caret pointing to the error + if line_text: + msg_parts.append("") + msg_parts.append(f" {line_text}") + prefix = line_text[: column - 1].expandtabs() + msg_parts.append(f" {' ' * len(prefix)}^") + + sys.stderr.write("\n".join(msg_parts) + "\n") + + def xdr_parser() -> Lark: """Return a Lark parser instance configured with the XDR language grammar""" -- cgit v1.2.3 From 4c53b89032f14577e94d747a3ca0aee63f18d856 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Dec 2025 10:19:34 -0500 Subject: xdrgen: Emit a max_arg_sz macro struct svc_service has a .vs_xdrsize field that is filled in by servers for each of their RPC programs. This field is supposed to contain the size of the largest procedure argument in the RPC program. This value is also sometimes used to size network transport buffers. Currently, server implementations must manually calculate and hard-code this value, which is error-prone and requires updates when procedure arguments change. Update xdrgen to determine which procedure argument structure is largest, and emit a macro with a well-known name that contains the size of that structure. Server code then uses this macro when initializing the .vs_xdrsize field. For NLM version 4, xdrgen now emits: #define NLM4_MAX_ARGS_SZ (NLM4_nlm4_lockargs_sz) Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/program.py | 35 +++++++++++++++++++++- tools/net/sunrpc/xdrgen/subcmds/definitions.py | 2 ++ .../xdrgen/templates/C/program/maxsize/max_args.j2 | 3 ++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 (limited to 'tools') diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py index decb092ef02c..c0cb3f6d3319 100644 --- a/tools/net/sunrpc/xdrgen/generators/program.py +++ b/tools/net/sunrpc/xdrgen/generators/program.py @@ -5,8 +5,9 @@ from jinja2 import Environment -from generators import SourceGenerator, create_jinja2_environment +from generators import SourceGenerator, create_jinja2_environment, get_jinja2_template from xdr_ast import _RpcProgram, _RpcVersion, excluded_apis +from xdr_ast import max_widths, get_header_name def emit_version_definitions( @@ -169,3 +170,35 @@ class XdrProgramGenerator(SourceGenerator): emit_version_argument_encoders( self.environment, program, version, ) + + def emit_maxsize(self, node: _RpcProgram) -> None: + """Emit maxsize macro for maximum RPC argument size""" + header = get_header_name().upper() + + # Find the largest argument across all versions + max_arg_width = 0 + max_arg_name = None + for version in node.versions: + for procedure in version.procedures: + if procedure.name in excluded_apis: + continue + arg_name = procedure.argument.type_name + if arg_name == "void": + continue + if arg_name not in max_widths: + continue + if max_widths[arg_name] > max_arg_width: + max_arg_width = max_widths[arg_name] + max_arg_name = arg_name + + if max_arg_name is None: + return + + macro_name = header + "_MAX_ARGS_SZ" + template = get_jinja2_template(self.environment, "maxsize", "max_args") + print( + template.render( + macro=macro_name, + width=header + "_" + max_arg_name + "_sz", + ) + ) diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py index d6c2dcd6f78f..b17526a03dda 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py +++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py @@ -66,6 +66,8 @@ def emit_header_maxsize(root: Specification, language: str, peer: str) -> None: gen = XdrStructGenerator(language, peer) elif isinstance(definition.value, _XdrUnion): gen = XdrUnionGenerator(language, peer) + elif isinstance(definition.value, _RpcProgram): + gen = XdrProgramGenerator(language, peer) else: continue gen.emit_maxsize(definition.value) diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 new file mode 100644 index 000000000000..9f3bfb47d2f4 --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +#define {{ '{:<31}'.format(macro) }} \ + ({{ width }}) -- cgit v1.2.3 From 5288993c4d1a8e59310e007aa68cf9b856551cc6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Dec 2025 10:19:35 -0500 Subject: xdrgen: Add enum value validation to generated decoders XDR enum decoders generated by xdrgen do not verify that incoming values are valid members of the enum. Incoming out-of-range values from malicious or buggy peers propagate through the system unchecked. Add validation logic to generated enum decoders using a switch statement that explicitly lists valid enumerator values. The compiler optimizes this to a simple range check when enum values are dense (contiguous), while correctly rejecting invalid values for sparse enums with gaps in their value ranges. The --no-enum-validation option on the source subcommand disables this validation when not needed. The minimum and maximum fields in _XdrEnum, which were previously unused placeholders for a range-based validation approach, have been removed since the switch-based validation handles both dense and sparse enums correctly. Because the new mechanism results in substantive changes to generated code, existing .x files are regenerated. Unrelated white space and semicolon changes in the generated code are due to recent commit 1c873a2fd110 ("xdrgen: Don't generate unnecessary semicolon") and commit 38c4df91242b ("xdrgen: Address some checkpatch whitespace complaints"). Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr_gen.c | 105 +++++++++++++++++---- fs/nfsd/nfs4xdr_gen.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 8 +- tools/net/sunrpc/xdrgen/generators/enum.py | 9 +- tools/net/sunrpc/xdrgen/subcmds/source.py | 3 +- .../sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 | 11 +++ .../xdrgen/templates/C/enum/decoder/enum_be.j2 | 20 ++++ tools/net/sunrpc/xdrgen/xdr_ast.py | 6 +- tools/net/sunrpc/xdrgen/xdr_parse.py | 14 +++ tools/net/sunrpc/xdrgen/xdrgen | 6 ++ 10 files changed, 156 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c index a17b5d8e60b3..1e5e2243625c 100644 --- a/fs/nfsd/nfs4xdr_gen.c +++ b/fs/nfsd/nfs4xdr_gen.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Generated by xdrgen. Manual edits will be lost. // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x -// XDR specification modification time: Mon Oct 14 09:10:13 2024 +// XDR specification modification time: Thu Dec 25 13:44:43 2025 #include @@ -11,13 +11,13 @@ static bool __maybe_unused xdrgen_decode_int64_t(struct xdr_stream *xdr, int64_t *ptr) { return xdrgen_decode_hyper(xdr, ptr); -}; +} static bool __maybe_unused xdrgen_decode_uint32_t(struct xdr_stream *xdr, uint32_t *ptr) { return xdrgen_decode_unsigned_int(xdr, ptr); -}; +} static bool __maybe_unused xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr) @@ -28,7 +28,7 @@ xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr) if (!xdrgen_decode_uint32_t(xdr, &ptr->element[i])) return false; return true; -}; +} static bool __maybe_unused xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr) @@ -38,13 +38,13 @@ xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr) if (!xdrgen_decode_uint32_t(xdr, &ptr->nseconds)) return false; return true; -}; +} static bool __maybe_unused xdrgen_decode_fattr4_offline(struct xdr_stream *xdr, fattr4_offline *ptr) { return xdrgen_decode_bool(xdr, ptr); -}; +} static bool __maybe_unused xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *ptr) @@ -60,7 +60,7 @@ xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *pt if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_create_mode)) return false; return true; -}; +} static bool __maybe_unused xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 *ptr) @@ -69,6 +69,15 @@ xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_ac if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_ARGS_SHARE_ACCESS_READ: + case OPEN_ARGS_SHARE_ACCESS_WRITE: + case OPEN_ARGS_SHARE_ACCESS_BOTH: + break; + default: + return false; + } *ptr = val; return true; } @@ -80,6 +89,16 @@ xdrgen_decode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_ARGS_SHARE_DENY_NONE: + case OPEN_ARGS_SHARE_DENY_READ: + case OPEN_ARGS_SHARE_DENY_WRITE: + case OPEN_ARGS_SHARE_DENY_BOTH: + break; + default: + return false; + } *ptr = val; return true; } @@ -91,6 +110,19 @@ xdrgen_decode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_sha if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG: + case OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG: + case OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL: + case OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED: + case OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS: + case OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION: + break; + default: + return false; + } *ptr = val; return true; } @@ -102,6 +134,19 @@ xdrgen_decode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_ARGS_OPEN_CLAIM_NULL: + case OPEN_ARGS_OPEN_CLAIM_PREVIOUS: + case OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR: + case OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV: + case OPEN_ARGS_OPEN_CLAIM_FH: + case OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH: + case OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH: + break; + default: + return false; + } *ptr = val; return true; } @@ -113,6 +158,16 @@ xdrgen_decode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_ARGS_CREATEMODE_UNCHECKED4: + case OPEN_ARGS_CREATE_MODE_GUARDED: + case OPEN_ARGS_CREATEMODE_EXCLUSIVE4: + case OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1: + break; + default: + return false; + } *ptr = val; return true; } @@ -121,19 +176,19 @@ bool xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr) { return xdrgen_decode_open_arguments4(xdr, ptr); -}; +} bool xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr) { return xdrgen_decode_nfstime4(xdr, ptr); -}; +} bool xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr) { return xdrgen_decode_nfstime4(xdr, ptr); -}; +} static bool __maybe_unused xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr) @@ -142,6 +197,18 @@ xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type if (xdr_stream_decode_u32(xdr, &val) < 0) return false; + /* Compiler may optimize to a range check for dense enums */ + switch (val) { + case OPEN_DELEGATE_NONE: + case OPEN_DELEGATE_READ: + case OPEN_DELEGATE_WRITE: + case OPEN_DELEGATE_NONE_EXT: + case OPEN_DELEGATE_READ_ATTRS_DELEG: + case OPEN_DELEGATE_WRITE_ATTRS_DELEG: + break; + default: + return false; + } *ptr = val; return true; } @@ -150,13 +217,13 @@ static bool __maybe_unused xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value) { return xdrgen_encode_hyper(xdr, value); -}; +} static bool __maybe_unused xdrgen_encode_uint32_t(struct xdr_stream *xdr, const uint32_t value) { return xdrgen_encode_unsigned_int(xdr, value); -}; +} static bool __maybe_unused xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value) @@ -167,7 +234,7 @@ xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value) if (!xdrgen_encode_uint32_t(xdr, value.element[i])) return false; return true; -}; +} static bool __maybe_unused xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value) @@ -177,13 +244,13 @@ xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value) if (!xdrgen_encode_uint32_t(xdr, value->nseconds)) return false; return true; -}; +} static bool __maybe_unused xdrgen_encode_fattr4_offline(struct xdr_stream *xdr, const fattr4_offline value) { return xdrgen_encode_bool(xdr, value); -}; +} static bool __maybe_unused xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_arguments4 *value) @@ -199,7 +266,7 @@ xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_argument if (!xdrgen_encode_bitmap4(xdr, value->oa_create_mode)) return false; return true; -}; +} static bool __maybe_unused xdrgen_encode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 value) @@ -235,19 +302,19 @@ bool xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value) { return xdrgen_encode_open_arguments4(xdr, value); -}; +} bool xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value) { return xdrgen_encode_nfstime4(xdr, value); -}; +} bool xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value) { return xdrgen_encode_nfstime4(xdr, value); -}; +} static bool __maybe_unused xdrgen_encode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 value) diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h index 41a0033b7256..47437876e803 100644 --- a/fs/nfsd/nfs4xdr_gen.h +++ b/fs/nfsd/nfs4xdr_gen.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ +/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */ #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H #define _LINUX_XDRGEN_NFS4_1_DECL_H diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index cf21a14aa885..352bffda08f7 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ +/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -40,6 +40,7 @@ enum open_args_share_access4 { OPEN_ARGS_SHARE_ACCESS_WRITE = 2, OPEN_ARGS_SHARE_ACCESS_BOTH = 3, }; + typedef enum open_args_share_access4 open_args_share_access4; enum open_args_share_deny4 { @@ -48,6 +49,7 @@ enum open_args_share_deny4 { OPEN_ARGS_SHARE_DENY_WRITE = 2, OPEN_ARGS_SHARE_DENY_BOTH = 3, }; + typedef enum open_args_share_deny4 open_args_share_deny4; enum open_args_share_access_want4 { @@ -59,6 +61,7 @@ enum open_args_share_access_want4 { OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20, OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21, }; + typedef enum open_args_share_access_want4 open_args_share_access_want4; enum open_args_open_claim4 { @@ -70,6 +73,7 @@ enum open_args_open_claim4 { OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5, OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6, }; + typedef enum open_args_open_claim4 open_args_open_claim4; enum open_args_createmode4 { @@ -78,6 +82,7 @@ enum open_args_createmode4 { OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2, OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3, }; + typedef enum open_args_createmode4 open_args_createmode4; typedef struct open_arguments4 fattr4_open_arguments; @@ -124,6 +129,7 @@ enum open_delegation_type4 { OPEN_DELEGATE_READ_ATTRS_DELEG = 4, OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5, }; + typedef enum open_delegation_type4 open_delegation_type4; #define NFS4_int64_t_sz \ diff --git a/tools/net/sunrpc/xdrgen/generators/enum.py b/tools/net/sunrpc/xdrgen/generators/enum.py index e62f715d3996..b4ed3ed6431e 100644 --- a/tools/net/sunrpc/xdrgen/generators/enum.py +++ b/tools/net/sunrpc/xdrgen/generators/enum.py @@ -5,6 +5,7 @@ from generators import SourceGenerator, create_jinja2_environment from xdr_ast import _XdrEnum, public_apis, big_endian, get_header_name +from xdr_parse import get_xdr_enum_validation class XdrEnumGenerator(SourceGenerator): @@ -42,7 +43,13 @@ class XdrEnumGenerator(SourceGenerator): template = self.environment.get_template("decoder/enum_be.j2") else: template = self.environment.get_template("decoder/enum.j2") - print(template.render(name=node.name)) + print( + template.render( + name=node.name, + enumerators=node.enumerators, + validate=get_xdr_enum_validation(), + ) + ) def emit_encoder(self, node: _XdrEnum) -> None: """Emit one encoder function for an XDR enum type""" diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py index 08c883f547d7..6508563494fe 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/source.py +++ b/tools/net/sunrpc/xdrgen/subcmds/source.py @@ -22,7 +22,7 @@ from xdr_ast import transform_parse_tree, _RpcProgram, Specification from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion -from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation from xdr_parse import make_error_handler, XdrParseError from xdr_parse import handle_transform_error @@ -98,6 +98,7 @@ def subcmd(args: Namespace) -> int: """Generate encoder and decoder functions""" set_xdr_annotate(args.annotate) + set_xdr_enum_validation(not args.no_enum_validation) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: source = f.read() diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 index 6482984f1cb7..735a34157fdf 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 @@ -14,6 +14,17 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr) if (xdr_stream_decode_u32(xdr, &val) < 0) return false; +{% if validate and enumerators %} + /* Compiler may optimize to a range check for dense enums */ + switch (val) { +{% for e in enumerators %} + case {{ e.name }}: +{% endfor %} + break; + default: + return false; + } +{% endif %} *ptr = val; return true; } diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 index 44c391c10b42..82782a510d47 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 @@ -10,5 +10,25 @@ static bool __maybe_unused {% endif %} xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr) { +{% if validate and enumerators %} + __be32 raw; + u32 val; + + if (xdr_stream_decode_be32(xdr, &raw) < 0) + return false; + val = be32_to_cpu(raw); + /* Compiler may optimize to a range check for dense enums */ + switch (val) { +{% for e in enumerators %} + case {{ e.name }}: +{% endfor %} + break; + default: + return false; + } + *ptr = raw; + return true; +{% else %} return xdr_stream_decode_be32(xdr, ptr) == 0; +{% endif %} } diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py index 2b5d160a0a60..dc2fa9fd8ec2 100644 --- a/tools/net/sunrpc/xdrgen/xdr_ast.py +++ b/tools/net/sunrpc/xdrgen/xdr_ast.py @@ -330,8 +330,6 @@ class _XdrEnum(_XdrAst): """An XDR enum definition""" name: str - minimum: int - maximum: int enumerators: List[_XdrEnumerator] def max_width(self) -> int: @@ -572,8 +570,6 @@ class ParseToAst(Transformer): value = children[1].value return _XdrConstant(name, value) - # cel: Python can compute a min() and max() for the enumerator values - # so that the generated code can perform proper range checking. def enum(self, children): """Instantiate one _XdrEnum object""" enum_name = children[0].symbol @@ -587,7 +583,7 @@ class ParseToAst(Transformer): enumerators.append(_XdrEnumerator(name, value)) i = i + 2 - return _XdrEnum(enum_name, 0, 0, enumerators) + return _XdrEnum(enum_name, enumerators) def fixed_length_opaque(self, children): """Instantiate one _XdrFixedLengthOpaque declaration object""" diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py index 38724ad5aea2..241e96c1fdd9 100644 --- a/tools/net/sunrpc/xdrgen/xdr_parse.py +++ b/tools/net/sunrpc/xdrgen/xdr_parse.py @@ -13,6 +13,9 @@ from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError # Set to True to emit annotation comments in generated source annotate = False +# Set to True to emit enum value validation in decoders +enum_validation = True + # Map internal Lark token names to human-readable names TOKEN_NAMES = { "__ANON_0": "identifier", @@ -49,6 +52,17 @@ def get_xdr_annotate() -> bool: return annotate +def set_xdr_enum_validation(set_it: bool) -> None: + """Set 'enum_validation' based on command line options""" + global enum_validation + enum_validation = set_it + + +def get_xdr_enum_validation() -> bool: + """Return True when enum validation is enabled for decoder generation""" + return enum_validation + + def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]: """Create an error handler that reports the first parse error and aborts. diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen index e22638f8324b..b2fb43f4a2ec 100755 --- a/tools/net/sunrpc/xdrgen/xdrgen +++ b/tools/net/sunrpc/xdrgen/xdrgen @@ -123,6 +123,12 @@ There is NO WARRANTY, to the extent permitted by law.""", help="Generate code for client or server side", type=str, ) + source_parser.add_argument( + "--no-enum-validation", + action="store_true", + default=False, + help="Disable enum value validation in decoders", + ) source_parser.add_argument("filename", help="File containing an XDR specification") source_parser.set_defaults(func=source.subcmd) -- cgit v1.2.3 From 78980b4c7fcb5ef74b7af65fbef5ce8d718cf791 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 19 Jan 2026 21:34:17 +0800 Subject: selftests/bpf: Harden cpu flags test for lru_percpu_hash map CI occasionally reports failures in the percpu_alloc/cpu_flag_lru_percpu_hash selftest, for example: First test_progs failure (test_progs_no_alu32-x86_64-llvm-21): #264/15 percpu_alloc/cpu_flag_lru_percpu_hash ... test_percpu_map_op_cpu_flag:FAIL:bpf_map_lookup_batch value on specified cpu unexpected bpf_map_lookup_batch value on specified cpu: actual 0 != expected 3735929054 The unexpected value indicates that an element was removed from the map. However, the test never calls delete_elem(), so the only possible cause is LRU eviction. This can happen when the current task migrates to another CPU: an update_elem() triggers eviction because there is no available LRU node on local freelist and global freelist. Harden the test against this behavior by provisioning sufficient spare elements. Set max_entries to 'nr_cpus * 2' and restrict the test to using the first nr_cpus entries, ensuring that updates do not spuriously trigger LRU eviction. Signed-off-by: Leon Hwang Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260119133417.19739-1-leon.hwang@linux.dev --- tools/testing/selftests/bpf/prog_tests/percpu_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index c1d0949f093f..a72ae0b29f6e 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -236,6 +236,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* update values on specified CPU */ for (i = 0; i < entries; i++) @@ -246,6 +248,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* lookup values on specified CPU */ batch = 0; @@ -254,6 +258,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) if (!ASSERT_EQ(values[i], value, @@ -269,6 +275,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) { values_row = (void *) values_percpu + @@ -287,7 +295,6 @@ out: free(values); } - static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) { struct percpu_alloc_array *skel; @@ -300,7 +307,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) return; - max_entries = nr_cpus + 1; + max_entries = nr_cpus * 2; keys = calloc(max_entries, key_sz); if (!ASSERT_OK_PTR(keys, "calloc keys")) return; @@ -322,7 +329,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_OK(err, "test_percpu_alloc__load")) goto out; - test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); + test_percpu_map_op_cpu_flag(map, keys, key_sz, nr_cpus, nr_cpus, true); out: percpu_alloc_array__destroy(skel); free(keys); -- cgit v1.2.3 From 096b86ce08332fbcb0ec6ff6714c44899ec03970 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 8 Jan 2026 09:43:24 +0000 Subject: tools headers: Go back to include asm-generic/unistd.h for arm64 The header unistd.h is included under Arm64's uAPI folder (see tools/arch/arm64/include/uapi/asm/), but it does not include its dependent header unistd_64.h. The intention is for unistd_64.h to be generated dynamically using scripts/Makefile.asm-headers. However, this dynamic approach causes problems because the header is not available early enough, even though it is widely included throughout tools. Using the perf build as an example: 1) Feature detection: Perf first runs feature tests. The BPF feature program test-bpf.c includes unistd.h. Since unistd_64.h has not been generated yet, the program fails to build, and the BPF feature ends up being disabled. 2) libperf build: The libperf Makefile later generates unistd_64.h on the fly, so libperf itself builds successfully. 3) Final perf build: Although the perf binary can build successfully using the generated header, we never get a chance to build BPF skeleton programs, because BPF support was already disabled earlier. Restore to include asm-generic/unistd.h for fixing the issue. This aligns with most architectures (x86 is a special case that keeps unistd_32.h/unistd_64.h for its particular syscall numbers) and ensures the header is available from the start. Fixes: 22f72088ffe69a37 ("tools headers: Update the syscall table with the kernel sources") Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/unistd.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index df36f23876e8..9306726337fe 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -1,2 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#include +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#define __ARCH_WANT_RENAMEAT +#define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS +#define __ARCH_WANT_MEMFD_SECRET + +#include -- cgit v1.2.3 From 129bb23a6f7d022610f902b57d36d69d7d210128 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 8 Jan 2026 09:43:25 +0000 Subject: Revert "perf tools: Fix arm64 build by generating unistd_64.h" This reverts: commit 8988c4b91945173a ("perf tools: Fix in-source libperf build") commit bfb713ea53c746b0 ("perf tools: Fix arm64 build by generating unistd_64.h") Since we now have a static unistd_64.h for the arm64 build, there is no need to generate unistd_64.h in libperf. Revert all patches related to generating unistd_64.h. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Makefile | 14 ++------------ tools/perf/Makefile.config | 1 - 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 27e6490f64dc..9692d0742ed0 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -42,7 +42,6 @@ libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) TEST_ARGS := $(if $(V),-v) INCLUDES = \ --I$(OUTPUT)arch/$(SRCARCH)/include/generated/uapi \ -I$(srctree)/tools/lib/perf/include \ -I$(srctree)/tools/lib/ \ -I$(srctree)/tools/include \ @@ -100,16 +99,7 @@ $(LIBAPI)-clean: $(call QUIET_CLEAN, libapi) $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null -uapi-asm := $(OUTPUT)arch/$(SRCARCH)/include/generated/uapi/asm -ifeq ($(SRCARCH),arm64) - syscall-y := $(uapi-asm)/unistd_64.h -endif -uapi-asm-generic: - $(if $(syscall-y),\ - $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.asm-headers obj=$(uapi-asm) \ - generic=include/uapi/asm-generic $(syscall-y),) - -$(LIBPERF_IN): uapi-asm-generic FORCE +$(LIBPERF_IN): FORCE $(Q)$(MAKE) $(build)=libperf $(LIBPERF_A): $(LIBPERF_IN) @@ -130,7 +120,7 @@ all: fixdep clean: $(LIBAPI)-clean $(call QUIET_CLEAN, libperf) $(RM) $(LIBPERF_A) \ *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd tests/*.o LIBPERF-CFLAGS $(LIBPERF_PC) \ - $(TESTS_STATIC) $(TESTS_SHARED) $(syscall-y) + $(TESTS_STATIC) $(TESTS_SHARED) TESTS_IN = tests-in.o diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5e4ae775987f..63ca9b2be663 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -64,7 +64,6 @@ include $(srctree)/tools/scripts/Makefile.arch $(call detected_var,SRCARCH) CFLAGS += -I$(OUTPUT)arch/$(SRCARCH)/include/generated -CFLAGS += -I$(OUTPUT)libperf/arch/$(SRCARCH)/include/generated/uapi # Additional ARCH settings for ppc ifeq ($(SRCARCH),powerpc) -- cgit v1.2.3 From 9966b382d06733f7467484bb440d6db68b743207 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 8 Jan 2026 09:43:26 +0000 Subject: tools headers: Don't check arm64's unistd.h The arm64 unistd.h in tools now diverges from the kernel header. Comparing the two headers is pointless, remove the check. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/check-headers.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index e0537f275da2..da3aca87457f 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -54,7 +54,6 @@ declare -a FILES=( "arch/s390/include/uapi/asm/kvm.h" "arch/s390/include/uapi/asm/sie.h" "arch/arm64/include/uapi/asm/kvm.h" - "arch/arm64/include/uapi/asm/unistd.h" "arch/alpha/include/uapi/asm/errno.h" "arch/mips/include/asm/errno.h" "arch/mips/include/uapi/asm/errno.h" -- cgit v1.2.3 From dda5f926a1006c735b00ed5c27291fce64236656 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Jan 2026 17:25:00 -0300 Subject: perf annotate: Fix BUILD_NONDISTRO=1 missing args->ms conversions to pointer Fix a few missing conversions to pointer in the usage of 'struct annotate_args' 'ms' member in symbol__disassemble_bpf_libbfd(). Fixes: 00419892bac28bf1 ("perf annotate: Fix args leak of map_symbol") Reviewed-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/libbfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c index 79f4528234a9..63ea3fb53e77 100644 --- a/tools/perf/util/libbfd.c +++ b/tools/perf/util/libbfd.c @@ -501,7 +501,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, struct bpf_prog_info_node *info_node; int len = sym->end - sym->start; disassembler_ftype disassemble; - struct map *map = args->ms.map; + struct map *map = args->ms->map; struct perf_bpil *info_linear; struct disassemble_info info; struct dso *dso = map__dso(map); @@ -612,7 +612,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, args->line = strdup(srcline); args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl) { annotation_line__add(&dl->al, @@ -624,7 +624,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, args->line = buf + prev_buf_size; args->line_nr = 0; args->fileloc = NULL; - args->ms.sym = sym; + args->ms->sym = sym; dl = disasm_line__new(args); if (dl) annotation_line__add(&dl->al, ¬es->src->source); -- cgit v1.2.3 From 008603bda19b29687edce533e4c09acff68c1077 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 26 Jan 2026 11:18:23 +0100 Subject: perf test: Fix test perf evlist for z/VM s390x Perf test case 'perf evlist tests' fails on z/VM machines on s390. The failure is causes by event cycles. This event is not available on virtualized machines like z/VM on s390. Change to software event cpu-clock to fix this. Output before: # ./perf test 78 79: perf evlist tests : FAILED! # Output after: # ./perf test 78 79: perf evlist tests : Ok # Fixes: b04d2b9199129f4f ("perf test: Fix test case perf evlist tests for s390x") Reviewed-by: Ian Rogers Reviewed-by: Jan Polensky Signed-off-by: Thomas Richter Tested-by: Jan Polensky Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Thomas Richter Cc: Vasily Gorbik Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/evlist.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/evlist.sh b/tools/perf/tests/shell/evlist.sh index 5632be391710..8a22f4171c07 100755 --- a/tools/perf/tests/shell/evlist.sh +++ b/tools/perf/tests/shell/evlist.sh @@ -21,13 +21,13 @@ trap trap_cleanup EXIT TERM INT test_evlist_simple() { echo "Simple evlist test" - if ! perf record -e cycles -o "${perfdata}" true 2> /dev/null + if ! perf record -e cpu-clock -o "${perfdata}" true 2> /dev/null then echo "Simple evlist [Failed record]" err=1 return fi - if ! perf evlist -i "${perfdata}" | grep -q "cycles" + if ! perf evlist -i "${perfdata}" | grep -q "cpu-clock" then echo "Simple evlist [Failed to list event]" err=1 -- cgit v1.2.3 From 76b2cf07a6d2a836108f9c2486d76599f7adf6e8 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 22 Jan 2026 13:39:46 +0530 Subject: perf vendor events amd: Fix Zen 5 MAB allocation events The unit masks for PMCx041 vary across different generations of Zen processors. Fix the Zen 5 events based on PMCx041 as they incorrectly use the same unit masks as that of Zen 4. Fixes: 45c072f2537ab07b ("perf vendor events amd: Add Zen 5 core events") Reported-by: Suyash Mahar Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen5/load-store.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json index ff6627a77805..06bbaea15925 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json +++ b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json @@ -70,19 +70,19 @@ "EventName": "ls_mab_alloc.load_store_allocations", "EventCode": "0x41", "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.", - "UMask": "0x3f" + "UMask": "0x07" }, { "EventName": "ls_mab_alloc.hardware_prefetcher_allocations", "EventCode": "0x41", "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.", - "UMask": "0x40" + "UMask": "0x08" }, { "EventName": "ls_mab_alloc.all_allocations", "EventCode": "0x41", "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.", - "UMask": "0x7f" + "UMask": "0x0f" }, { "EventName": "ls_dmnd_fills_from_sys.local_l2", -- cgit v1.2.3 From 7d0ebeb6c0f735d4eddc679283a1de1dea2ae878 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 23 Jan 2026 14:22:06 -0800 Subject: perf dso: Factor out e_machine reading for use in thread Factor out the resilient e_machine reading code in dso so that it may be used in thread. As there is no dso in that case, make the dso optional. This makes some minor other changes as the swap type from the dso cannot be ascertained. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Swapnil Sapkal Cc: Tianyou Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 110 +++++++++++++++++++++++++++++------------------ tools/perf/util/dso.h | 10 +++-- tools/perf/util/thread.c | 5 +-- 3 files changed, 75 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 3b272a6fae24..91c9f7cb9d8c 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1203,6 +1203,68 @@ ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine, return data_read_write_offset(dso, machine, offset, data, size, true); } +static enum dso_swap_type dso_swap_type__from_elf_data(unsigned char eidata) +{ + static const unsigned int endian = 1; + + switch (eidata) { + case ELFDATA2LSB: + /* We are big endian, DSO is little endian. */ + return (*(unsigned char const *)&endian != 1) ? DSO_SWAP__YES : DSO_SWAP__NO; + case ELFDATA2MSB: + /* We are little endian, DSO is big endian. */ + return (*(unsigned char const *)&endian != 0) ? DSO_SWAP__YES : DSO_SWAP__NO; + default: + return DSO_SWAP__UNSET; + } +} + +/* Reads e_machine from fd, optionally caching data in dso. */ +uint16_t dso__read_e_machine(struct dso *optional_dso, int fd) +{ + uint16_t e_machine = EM_NONE; + unsigned char e_ident[EI_NIDENT]; + enum dso_swap_type swap_type; + + _Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset"); + _Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset"); + if (pread(fd, &e_ident, sizeof(e_ident), 0) != sizeof(e_ident)) + return EM_NONE; // Read failed. + + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) + return EM_NONE; // Not an ELF file. + + if (e_ident[EI_CLASS] == ELFCLASSNONE || e_ident[EI_CLASS] >= ELFCLASSNUM) + return EM_NONE; // Bad ELF class (32 or 64-bit objects). + + if (e_ident[EI_VERSION] != EV_CURRENT) + return EM_NONE; // Bad ELF version. + + swap_type = dso_swap_type__from_elf_data(e_ident[EI_DATA]); + if (swap_type == DSO_SWAP__UNSET) + return EM_NONE; // Bad ELF data encoding. + + /* Cache the need for swapping. */ + if (optional_dso) { + assert(dso__needs_swap(optional_dso) == DSO_SWAP__UNSET || + dso__needs_swap(optional_dso) == swap_type); + dso__set_needs_swap(optional_dso, swap_type); + } + + { + _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); + _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); + if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine)) + return EM_NONE; // e_machine read failed. + } + + e_machine = DSO_SWAP_TYPE__SWAP(swap_type, uint16_t, e_machine); + if (e_machine >= EM_NUM) + return EM_NONE; // Bad ELF machine number. + + return e_machine; +} + uint16_t dso__e_machine(struct dso *dso, struct machine *machine) { uint16_t e_machine = EM_NONE; @@ -1248,30 +1310,9 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine) */ try_to_open_dso(dso, machine); fd = dso__data(dso)->fd; - if (fd >= 0) { - unsigned char e_ident[EI_NIDENT]; - - _Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset"); - _Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset"); - if (pread(fd, &e_ident, sizeof(e_ident), 0) == sizeof(e_ident) && - memcmp(e_ident, ELFMAG, SELFMAG) == 0 && - e_ident[EI_CLASS] > ELFCLASSNONE && e_ident[EI_CLASS] < ELFCLASSNUM && - e_ident[EI_DATA] > ELFDATANONE && e_ident[EI_DATA] < ELFDATANUM && - e_ident[EI_VERSION] == EV_CURRENT) { - _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); - _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); - - if (dso__needs_swap(dso) == DSO_SWAP__UNSET) - dso__swap_init(dso, e_ident[EI_DATA]); - - if (dso__needs_swap(dso) != DSO_SWAP__UNSET && - pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine) && - e_machine < EM_NUM) - e_machine = DSO__SWAP(dso, uint16_t, e_machine); - else - e_machine = EM_NONE; - } - } + if (fd >= 0) + e_machine = dso__read_e_machine(dso, fd); + mutex_unlock(dso__data_open_lock()); return e_machine; } @@ -1656,28 +1697,13 @@ void dso__put(struct dso *dso) int dso__swap_init(struct dso *dso, unsigned char eidata) { - static unsigned int const endian = 1; - - dso__set_needs_swap(dso, DSO_SWAP__NO); + enum dso_swap_type type = dso_swap_type__from_elf_data(eidata); - switch (eidata) { - case ELFDATA2LSB: - /* We are big endian, DSO is little endian. */ - if (*(unsigned char const *)&endian != 1) - dso__set_needs_swap(dso, DSO_SWAP__YES); - break; - - case ELFDATA2MSB: - /* We are little endian, DSO is big endian. */ - if (*(unsigned char const *)&endian != 0) - dso__set_needs_swap(dso, DSO_SWAP__YES); - break; - - default: + dso__set_needs_swap(dso, type); + if (type == DSO_SWAP__UNSET) { pr_err("unrecognized DSO data encoding %d\n", eidata); return -EINVAL; } - return 0; } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ac725bc8ea74..a95fee7d634b 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -160,12 +160,11 @@ enum dso_load_errno { __DSO_LOAD_ERRNO__END, }; -#define DSO__SWAP(dso, type, val) \ +#define DSO_SWAP_TYPE__SWAP(swap_type, type, val) \ ({ \ type ____r = val; \ - enum dso_swap_type ___dst = dso__needs_swap(dso); \ - BUG_ON(___dst == DSO_SWAP__UNSET); \ - if (___dst == DSO_SWAP__YES) { \ + BUG_ON(swap_type == DSO_SWAP__UNSET); \ + if (swap_type == DSO_SWAP__YES) { \ switch (sizeof(____r)) { \ case 2: \ ____r = bswap_16(val); \ @@ -183,6 +182,8 @@ enum dso_load_errno { ____r; \ }) +#define DSO__SWAP(dso, type, val) DSO_SWAP_TYPE__SWAP(dso__needs_swap(dso), type, val) + #define DSO__DATA_CACHE_SIZE 4096 #define DSO__DATA_CACHE_MASK ~(DSO__DATA_CACHE_SIZE - 1) @@ -865,6 +866,7 @@ int dso__data_file_size(struct dso *dso, struct machine *machine); off_t dso__data_size(struct dso *dso, struct machine *machine); ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine, u64 offset, u8 *data, ssize_t size); +uint16_t dso__read_e_machine(struct dso *optional_dso, int fd); uint16_t dso__e_machine(struct dso *dso, struct machine *machine); ssize_t dso__data_read_addr(struct dso *dso, struct map *map, struct machine *machine, u64 addr, diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index aa9c58bbf9d3..3642858e6cbc 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -458,10 +458,7 @@ static uint16_t read_proc_e_machine_for_pid(pid_t pid) snprintf(path, sizeof(path), "/proc/%d/exe", pid); fd = open(path, O_RDONLY); if (fd >= 0) { - _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); - _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); - if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine)) - e_machine = EM_NONE; + e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd); close(fd); } return e_machine; -- cgit v1.2.3 From 4e66527f8859a6614a3a8afd11778c832a30ebbb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 23 Jan 2026 14:22:07 -0800 Subject: perf thread: Add optional e_flags output argument to thread__e_machine The e_flags are needed to accurately compute complete perf register information for CSKY. Add the ability to read and have this value associated with a thread. This change doesn't wire up the use of the e_flags except in disasm where use already exists but just wasn't set up yet. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Swapnil Sapkal Cc: Tianyou Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 14 +++-- tools/perf/builtin-trace.c | 12 +++-- tools/perf/util/annotate.c | 5 +- tools/perf/util/disasm.c | 5 +- tools/perf/util/disasm.h | 2 +- tools/perf/util/dso.c | 43 +++++++++++++--- tools/perf/util/dso.h | 4 +- .../util/scripting-engines/trace-event-python.c | 2 +- tools/perf/util/session.c | 4 +- tools/perf/util/thread.c | 59 +++++++++++++++------- tools/perf/util/thread.h | 16 +++++- tools/perf/util/unwind-libdw.c | 4 +- 12 files changed, 122 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 372bede30230..8c0de27a9713 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2504,11 +2504,17 @@ static void process_event(struct perf_script *script, symbol_conf.bt_stop_list, fp); } - if (PRINT_FIELD(IREGS)) - perf_sample__fprintf_iregs(sample, attr, thread__e_machine(thread, machine), fp); + if (PRINT_FIELD(IREGS)) { + perf_sample__fprintf_iregs(sample, attr, + thread__e_machine(thread, machine, /*e_flags=*/NULL), + fp); + } - if (PRINT_FIELD(UREGS)) - perf_sample__fprintf_uregs(sample, attr, thread__e_machine(thread, machine), fp); + if (PRINT_FIELD(UREGS)) { + perf_sample__fprintf_uregs(sample, attr, + thread__e_machine(thread, machine, /*e_flags=*/NULL), + fp); + } if (PRINT_FIELD(BRSTACK)) perf_sample__fprintf_brstack(sample, thread, evsel, fp); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8df5ca44e4f9..311d9da9896a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2789,7 +2789,7 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel, struct thread_trace *ttrace; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - e_machine = thread__e_machine(thread, trace->host); + e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL); sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc == NULL) goto out_put; @@ -2868,7 +2868,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel, thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - e_machine = thread__e_machine(thread, trace->host); + e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL); sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc == NULL) goto out_put; @@ -2934,7 +2934,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, struct thread_trace *ttrace; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - e_machine = thread__e_machine(thread, trace->host); + e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL); sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc == NULL) goto out_put; @@ -3285,7 +3285,9 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (evsel == trace->syscalls.events.bpf_output) { int id = perf_evsel__sc_tp_uint(evsel, id, sample); - int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST; + int e_machine = thread + ? thread__e_machine(thread, trace->host, /*e_flags=*/NULL) + : EM_HOST; struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc) { @@ -4916,7 +4918,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac { size_t printed = 0; struct thread_trace *ttrace = thread__priv(thread); - int e_machine = thread__e_machine(thread, trace->host); + int e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL); double ratio; if (ttrace == NULL) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index c16c6dfaa959..880b1bd300c2 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -984,6 +984,7 @@ int thread__get_arch(struct thread *thread, const struct arch **parch) { const struct arch *arch; struct machine *machine; + uint32_t e_flags; uint16_t e_machine; if (!thread) { @@ -992,8 +993,8 @@ int thread__get_arch(struct thread *thread, const struct arch **parch) } machine = maps__machine(thread__maps(thread)); - e_machine = thread__e_machine(thread, machine); - arch = arch__find(e_machine, machine->env ? machine->env->cpuid : NULL); + e_machine = thread__e_machine(thread, machine, &e_flags); + arch = arch__find(e_machine, e_flags, machine->env ? machine->env->cpuid : NULL); if (arch == NULL) { pr_err("%s: unsupported arch %d\n", __func__, e_machine); return errno; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 9b0ba1fc5aec..6b36287f30fe 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -134,7 +134,7 @@ static int arch__cmp(const void *a, const void *b) return e_machine_and_eflags__cmp(&(*aa)->id, &(*ab)->id); } -const struct arch *arch__find(uint16_t e_machine, const char *cpuid) +const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid) { static const struct arch *(*const arch_new_fn[])(const struct e_machine_and_e_flags *id, const char *cpuid) = { @@ -157,8 +157,7 @@ const struct arch *arch__find(uint16_t e_machine, const char *cpuid) static size_t num_archs; struct e_machine_and_e_flags key = { .e_machine = e_machine, - // TODO: e_flags should really come from the same source as e_machine. - .e_flags = EF_HOST, + .e_flags = e_flags, }; const struct arch *result = NULL, **tmp; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 6a1905f9d4fc..a6e478caf61a 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -108,7 +108,7 @@ struct annotate_args { char *fileloc; }; -const struct arch *arch__find(uint16_t e_machine, const char *cpuid); +const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid); bool arch__is_x86(const struct arch *arch); bool arch__is_powerpc(const struct arch *arch); diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 91c9f7cb9d8c..b791e1b6b2cf 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1220,14 +1220,20 @@ static enum dso_swap_type dso_swap_type__from_elf_data(unsigned char eidata) } /* Reads e_machine from fd, optionally caching data in dso. */ -uint16_t dso__read_e_machine(struct dso *optional_dso, int fd) +uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags) { uint16_t e_machine = EM_NONE; unsigned char e_ident[EI_NIDENT]; enum dso_swap_type swap_type; + bool need_e_flags; - _Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset"); - _Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset"); + if (e_flags) + *e_flags = 0; + + { + _Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset"); + _Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset"); + } if (pread(fd, &e_ident, sizeof(e_ident), 0) != sizeof(e_ident)) return EM_NONE; // Read failed. @@ -1254,18 +1260,35 @@ uint16_t dso__read_e_machine(struct dso *optional_dso, int fd) { _Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset"); _Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset"); - if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine)) - return EM_NONE; // e_machine read failed. } + if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine)) + return EM_NONE; // e_machine read failed. e_machine = DSO_SWAP_TYPE__SWAP(swap_type, uint16_t, e_machine); if (e_machine >= EM_NUM) return EM_NONE; // Bad ELF machine number. +#ifdef NDEBUG + /* In production code the e_flags are only needed on CSKY. */ + need_e_flags = e_flags && e_machine == EM_CSKY; +#else + /* Debug code will always read the e_flags. */ + need_e_flags = e_flags != NULL; +#endif + if (need_e_flags) { + off_t offset = e_ident[EI_CLASS] == ELFCLASS32 + ? offsetof(Elf32_Ehdr, e_flags) + : offsetof(Elf64_Ehdr, e_flags); + + if (pread(fd, e_flags, sizeof(*e_flags), offset) != sizeof(*e_flags)) { + *e_flags = 0; + return EM_NONE; // e_flags read failed. + } + } return e_machine; } -uint16_t dso__e_machine(struct dso *dso, struct machine *machine) +uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags) { uint16_t e_machine = EM_NONE; int fd; @@ -1285,6 +1308,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine) case DSO_BINARY_TYPE__BPF_IMAGE: case DSO_BINARY_TYPE__OOL: case DSO_BINARY_TYPE__JAVA_JIT: + if (e_flags) + *e_flags = EF_HOST; return EM_HOST; case DSO_BINARY_TYPE__DEBUGLINK: case DSO_BINARY_TYPE__BUILD_ID_CACHE: @@ -1299,6 +1324,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine) break; case DSO_BINARY_TYPE__NOT_FOUND: default: + if (e_flags) + *e_flags = 0; return EM_NONE; } @@ -1311,7 +1338,9 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine) try_to_open_dso(dso, machine); fd = dso__data(dso)->fd; if (fd >= 0) - e_machine = dso__read_e_machine(dso, fd); + e_machine = dso__read_e_machine(dso, fd, e_flags); + else if (e_flags) + *e_flags = 0; mutex_unlock(dso__data_open_lock()); return e_machine; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index a95fee7d634b..ede691e9a249 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -866,8 +866,8 @@ int dso__data_file_size(struct dso *dso, struct machine *machine); off_t dso__data_size(struct dso *dso, struct machine *machine); ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine, u64 offset, u8 *data, ssize_t size); -uint16_t dso__read_e_machine(struct dso *optional_dso, int fd); -uint16_t dso__e_machine(struct dso *dso, struct machine *machine); +uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags); +uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags); ssize_t dso__data_read_addr(struct dso *dso, struct map *map, struct machine *machine, u64 addr, u8 *data, ssize_t size); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index b90edc147796..50f0d16520cc 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -925,7 +925,7 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, if (al->thread) { machine = maps__machine(thread__maps(al->thread)); - e_machine = thread__e_machine(al->thread, machine); + e_machine = thread__e_machine(al->thread, machine, /*e_flags=*/NULL); } if (set_regs_in_dict(dict, sample, evsel, e_machine)) Py_FatalError("Failed to setting regs in dict"); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index c0231bc000e7..0e8a128d7c04 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1124,7 +1124,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) { struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid); - e_machine = thread__e_machine(thread, machine); + e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); } printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", @@ -2965,7 +2965,7 @@ static int perf_session__e_machine_cb(struct thread *thread, uint16_t *result = arg; struct machine *machine = maps__machine(thread__maps(thread)); - *result = thread__e_machine(thread, machine); + *result = thread__e_machine(thread, machine, /*e_flags=*/NULL); return *result != EM_NONE ? 1 : 0; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 3642858e6cbc..618f29afb160 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -449,7 +449,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, } } -static uint16_t read_proc_e_machine_for_pid(pid_t pid) +static uint16_t read_proc_e_machine_for_pid(pid_t pid, uint32_t *e_flags) { char path[6 /* "/proc/" */ + 11 /* max length of pid */ + 5 /* "/exe\0" */]; int fd; @@ -458,30 +458,46 @@ static uint16_t read_proc_e_machine_for_pid(pid_t pid) snprintf(path, sizeof(path), "/proc/%d/exe", pid); fd = open(path, O_RDONLY); if (fd >= 0) { - e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd); + e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd, e_flags); close(fd); } return e_machine; } -static int thread__e_machine_callback(struct map *map, void *machine) +struct thread__e_machine_callback_args { + struct machine *machine; + uint32_t e_flags; + uint16_t e_machine; +}; + +static int thread__e_machine_callback(struct map *map, void *_args) { + struct thread__e_machine_callback_args *args = _args; struct dso *dso = map__dso(map); - _Static_assert(0 == EM_NONE, "Unexpected EM_NONE"); if (!dso) - return EM_NONE; + return 0; // No dso, continue search. - return dso__e_machine(dso, machine); + args->e_machine = dso__e_machine(dso, args->machine, &args->e_flags); + return args->e_machine != EM_NONE ? 1 /* stop search */ : 0 /* continue search */; } -uint16_t thread__e_machine(struct thread *thread, struct machine *machine) +uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags) { pid_t tid, pid; uint16_t e_machine = RC_CHK_ACCESS(thread)->e_machine; + uint32_t local_e_flags = 0; + struct thread__e_machine_callback_args args = { + .machine = machine, + .e_flags = 0, + .e_machine = EM_NONE, + }; - if (e_machine != EM_NONE) + if (e_machine != EM_NONE) { + if (e_flags) + *e_flags = thread__e_flags(thread); return e_machine; + } tid = thread__tid(thread); pid = thread__pid(thread); @@ -489,18 +505,19 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine) struct thread *parent = machine__findnew_thread(machine, pid, pid); if (parent) { - e_machine = thread__e_machine(parent, machine); + e_machine = thread__e_machine(parent, machine, &local_e_flags); thread__put(parent); - thread__set_e_machine(thread, e_machine); - return e_machine; + goto out; } /* Something went wrong, fallback. */ } /* Reading on the PID thread. First try to find from the maps. */ - e_machine = maps__for_each_map(thread__maps(thread), - thread__e_machine_callback, - machine); - if (e_machine == EM_NONE) { + maps__for_each_map(thread__maps(thread), thread__e_machine_callback, &args); + + if (args.e_machine != EM_NONE) { + e_machine = args.e_machine; + local_e_flags = args.e_flags; + } else { /* Maps failed, perhaps we're live with map events disabled. */ bool is_live = machine->machines == NULL; @@ -514,12 +531,18 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine) } /* Read from /proc/pid/exe if live. */ if (is_live) - e_machine = read_proc_e_machine_for_pid(pid); + e_machine = read_proc_e_machine_for_pid(pid, &local_e_flags); } - if (e_machine != EM_NONE) +out: + if (e_machine != EM_NONE) { thread__set_e_machine(thread, e_machine); - else + thread__set_e_flags(thread, local_e_flags); + } else { e_machine = EM_HOST; + local_e_flags = EF_HOST; + } + if (e_flags) + *e_flags = local_e_flags; return e_machine; } diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 310eaea344bb..f5792d3e8a16 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -60,6 +60,10 @@ DECLARE_RC_STRUCT(thread) { struct srccode_state srccode_state; bool filter; int filter_entry_depth; + /** + * @e_flags: The ELF EF_* associated with the thread. Valid if e_machine != EM_NONE. + */ + uint16_t e_flags; /** * @e_machine: The ELF EM_* associated with the thread. EM_NONE if not * computed. @@ -307,13 +311,23 @@ static inline void thread__set_filter_entry_depth(struct thread *thread, int dep RC_CHK_ACCESS(thread)->filter_entry_depth = depth; } -uint16_t thread__e_machine(struct thread *thread, struct machine *machine); +uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags); static inline void thread__set_e_machine(struct thread *thread, uint16_t e_machine) { RC_CHK_ACCESS(thread)->e_machine = e_machine; } +static inline uint32_t thread__e_flags(const struct thread *thread) +{ + return RC_CHK_ACCESS(thread)->e_flags; +} + +static inline void thread__set_e_flags(struct thread *thread, uint32_t e_flags) +{ + RC_CHK_ACCESS(thread)->e_flags = e_flags; +} + static inline bool thread__lbr_stitch_enable(const struct thread *thread) { diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 9cb0960ef905..3fdcfa06bf22 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -213,7 +213,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * { struct dwfl_ui_thread_info *dwfl_ui_ti = arg; struct unwind_info *ui = dwfl_ui_ti->ui; - uint16_t e_machine = thread__e_machine(ui->thread, ui->machine); + uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL); struct stack_dump *stack = &ui->sample->user_stack; u64 start, end; int offset; @@ -348,7 +348,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, { struct maps *maps = thread__maps(thread); struct machine *machine = maps__machine(maps); - uint16_t e_machine = thread__e_machine(thread, machine); + uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); struct dwfl_ui_thread_info *dwfl_ui_ti; static struct unwind_info *ui; Dwfl *dwfl; -- cgit v1.2.3 From 0403930f7b1534e383116f7122539873dad3c6a6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 23 Jan 2026 14:22:08 -0800 Subject: perf perf_regs: Accurately compute register names for CSKY CSKY needs the e_flags to determine the ABI level and know whether additional registers are encoded or not. Wire this up now that the e_flags for a thread can be determined. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Swapnil Sapkal Cc: Tianyou Li [ Conditionally define EF_CSKY_ABIMASK and EF_CSKY_ABIV2 for older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 28 +++++++++++++++------- tools/perf/util/parse-regs-options.c | 4 ++-- tools/perf/util/perf-regs-arch/perf_regs_csky.c | 19 ++++++++++----- tools/perf/util/perf_regs.c | 4 ++-- tools/perf/util/perf_regs.h | 4 ++-- .../util/scripting-engines/trace-event-python.c | 17 +++++++------ tools/perf/util/session.c | 24 ++++++++++--------- 7 files changed, 62 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8c0de27a9713..6ec225c697a4 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -717,7 +717,8 @@ out: return 0; } -static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, +static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, + uint16_t e_machine, uint32_t e_flags, FILE *fp) { unsigned i = 0, r; @@ -730,7 +731,9 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { u64 val = regs->regs[i++]; - printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, e_machine), val); + printed += fprintf(fp, "%5s:0x%"PRIx64" ", + perf_reg_name(r, e_machine, e_flags), + val); } return printed; @@ -787,23 +790,29 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen, } static int perf_sample__fprintf_iregs(struct perf_sample *sample, - struct perf_event_attr *attr, uint16_t e_machine, FILE *fp) + struct perf_event_attr *attr, + uint16_t e_machine, + uint32_t e_flags, + FILE *fp) { if (!sample->intr_regs) return 0; return perf_sample__fprintf_regs(perf_sample__intr_regs(sample), - attr->sample_regs_intr, e_machine, fp); + attr->sample_regs_intr, e_machine, e_flags, fp); } static int perf_sample__fprintf_uregs(struct perf_sample *sample, - struct perf_event_attr *attr, uint16_t e_machine, FILE *fp) + struct perf_event_attr *attr, + uint16_t e_machine, + uint32_t e_flags, + FILE *fp) { if (!sample->user_regs) return 0; return perf_sample__fprintf_regs(perf_sample__user_regs(sample), - attr->sample_regs_user, e_machine, fp); + attr->sample_regs_user, e_machine, e_flags, fp); } static int perf_sample__fprintf_start(struct perf_script *script, @@ -2418,6 +2427,7 @@ static void process_event(struct perf_script *script, struct evsel_script *es = evsel->priv; FILE *fp = es->fp; char str[PAGE_SIZE_NAME_LEN]; + uint32_t e_flags; if (output[type].fields == 0) return; @@ -2506,13 +2516,15 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(IREGS)) { perf_sample__fprintf_iregs(sample, attr, - thread__e_machine(thread, machine, /*e_flags=*/NULL), + thread__e_machine(thread, machine, &e_flags), + e_flags, fp); } if (PRINT_FIELD(UREGS)) { perf_sample__fprintf_uregs(sample, attr, - thread__e_machine(thread, machine, /*e_flags=*/NULL), + thread__e_machine(thread, machine, &e_flags), + e_flags, fp); } diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c index c0d0ef9fd495..8dd35f50f644 100644 --- a/tools/perf/util/parse-regs-options.c +++ b/tools/perf/util/parse-regs-options.c @@ -21,7 +21,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask) if (((1ULL << reg) & mask) == 0) continue; - name = perf_reg_name(reg, EM_HOST); + name = perf_reg_name(reg, EM_HOST, EF_HOST); if (name && (!last_name || strcmp(last_name, name))) fprintf(fp, "%s%s", reg > 0 ? " " : "", name); last_name = name; @@ -39,7 +39,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask) if (((1ULL << reg) & mask) == 0) continue; - name = perf_reg_name(reg, EM_HOST); + name = perf_reg_name(reg, EM_HOST, EF_HOST); if (!name) continue; diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c b/tools/perf/util/perf-regs-arch/perf_regs_csky.c index 75b461ef2eba..95808f93d45b 100644 --- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c +++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c @@ -1,10 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 - +#include +#ifndef EF_CSKY_ABIMASK +#define EF_CSKY_ABIMASK 0XF0000000 +#endif +#ifndef EF_CSKY_ABIV2 +#define EF_CSKY_ABIV2 0X20000000 +#endif #include "../perf_regs.h" +#undef __CSKYABIV2__ +#define __CSKYABIV2__ 1 // Always want the V2 register definitions. #include "../../arch/csky/include/uapi/asm/perf_regs.h" -const char *__perf_reg_name_csky(int id) +const char *__perf_reg_name_csky(int id, uint32_t e_flags) { + if (id >= PERF_REG_CSKY_EXREGS0 && (e_flags & EF_CSKY_ABIMASK) == EF_CSKY_ABIV2) + return NULL; + switch (id) { case PERF_REG_CSKY_A0: return "a0"; @@ -40,7 +51,6 @@ const char *__perf_reg_name_csky(int id) return "lr"; case PERF_REG_CSKY_PC: return "pc"; -#if defined(__CSKYABIV2__) case PERF_REG_CSKY_EXREGS0: return "exregs0"; case PERF_REG_CSKY_EXREGS1: @@ -77,12 +87,9 @@ const char *__perf_reg_name_csky(int id) return "hi"; case PERF_REG_CSKY_LO: return "lo"; -#endif default: return NULL; } - - return NULL; } uint64_t __perf_reg_ip_csky(void) diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index cd5acee3dc62..14b7be30ab20 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -23,7 +23,7 @@ uint64_t __weak arch__user_reg_mask(void) return 0; } -const char *perf_reg_name(int id, uint16_t e_machine) +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags) { const char *reg_name = NULL; @@ -35,7 +35,7 @@ const char *perf_reg_name(int id, uint16_t e_machine) reg_name = __perf_reg_name_arm64(id); break; case EM_CSKY: - reg_name = __perf_reg_name_csky(id); + reg_name = __perf_reg_name_csky(id, e_flags); break; case EM_LOONGARCH: reg_name = __perf_reg_name_loongarch(id); diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h index 2c2a8de6912d..ed7c1b1358fa 100644 --- a/tools/perf/util/perf_regs.h +++ b/tools/perf/util/perf_regs.h @@ -16,7 +16,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op); uint64_t arch__intr_reg_mask(void); uint64_t arch__user_reg_mask(void); -const char *perf_reg_name(int id, uint16_t e_machine); +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags); int perf_reg_value(u64 *valp, struct regs_dump *regs, int id); uint64_t perf_arch_reg_ip(uint16_t e_machine); uint64_t perf_arch_reg_sp(uint16_t e_machine); @@ -26,7 +26,7 @@ uint64_t __perf_reg_sp_arm64(void); const char *__perf_reg_name_arm(int id); uint64_t __perf_reg_ip_arm(void); uint64_t __perf_reg_sp_arm(void); -const char *__perf_reg_name_csky(int id); +const char *__perf_reg_name_csky(int id, uint32_t e_flags); uint64_t __perf_reg_ip_csky(void); uint64_t __perf_reg_sp_csky(void); const char *__perf_reg_name_loongarch(int id); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 50f0d16520cc..62c9c73daef5 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -714,7 +714,8 @@ static void set_sample_datasrc_in_dict(PyObject *dict, _PyUnicode_FromString(decode)); } -static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, char *bf, int size) +static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, uint32_t e_flags, + char *bf, int size) { unsigned int i = 0, r; int printed = 0; @@ -732,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, printed += scnprintf(bf + printed, size - printed, "%5s:0x%" PRIx64 " ", - perf_reg_name(r, e_machine), val); + perf_reg_name(r, e_machine, e_flags), val); } } @@ -741,7 +742,8 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, static int set_regs_in_dict(PyObject *dict, struct perf_sample *sample, struct evsel *evsel, - uint16_t e_machine) + uint16_t e_machine, + uint32_t e_flags) { struct perf_event_attr *attr = &evsel->core.attr; @@ -753,7 +755,7 @@ static int set_regs_in_dict(PyObject *dict, if (!bf) return -1; - regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, bf, size); + regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, e_flags, bf, size); pydict_set_item_string_decref(dict, "iregs", _PyUnicode_FromString(bf)); @@ -765,7 +767,7 @@ static int set_regs_in_dict(PyObject *dict, if (!bf) return -1; } - regs_map(sample->user_regs, attr->sample_regs_user, e_machine, bf, size); + regs_map(sample->user_regs, attr->sample_regs_user, e_machine, e_flags, bf, size); pydict_set_item_string_decref(dict, "uregs", _PyUnicode_FromString(bf)); @@ -837,6 +839,7 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, PyObject *dict, *dict_sample, *brstack, *brstacksym; struct machine *machine; uint16_t e_machine = EM_HOST; + uint32_t e_flags = EF_HOST; dict = PyDict_New(); if (!dict) @@ -925,9 +928,9 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, if (al->thread) { machine = maps__machine(thread__maps(al->thread)); - e_machine = thread__e_machine(al->thread, machine, /*e_flags=*/NULL); + e_machine = thread__e_machine(al->thread, machine, &e_flags); } - if (set_regs_in_dict(dict, sample, evsel, e_machine)) + if (set_regs_in_dict(dict, sample, evsel, e_machine, e_flags)) Py_FatalError("Failed to setting regs in dict"); return dict; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0e8a128d7c04..7c7c65b0f536 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -959,7 +959,7 @@ static void branch_stack__printf(struct perf_sample *sample, } } -static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine) +static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags) { unsigned rid, i = 0; @@ -967,7 +967,7 @@ static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine) u64 val = regs[i++]; printf(".... %-5s 0x%016" PRIx64 "\n", - perf_reg_name(rid, e_machine), val); + perf_reg_name(rid, e_machine, e_flags), val); } } @@ -985,7 +985,8 @@ static inline const char *regs_dump_abi(struct regs_dump *d) return regs_abi[d->abi]; } -static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_machine) +static void regs__printf(const char *type, struct regs_dump *regs, + uint16_t e_machine, uint32_t e_flags) { u64 mask = regs->mask; @@ -994,10 +995,10 @@ static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_ma mask, regs_dump_abi(regs)); - regs_dump__printf(mask, regs->regs, e_machine); + regs_dump__printf(mask, regs->regs, e_machine, e_flags); } -static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine) +static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) { struct regs_dump *user_regs; @@ -1007,10 +1008,10 @@ static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine) user_regs = perf_sample__user_regs(sample); if (user_regs->regs) - regs__printf("user", user_regs, e_machine); + regs__printf("user", user_regs, e_machine, e_flags); } -static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine) +static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) { struct regs_dump *intr_regs; @@ -1020,7 +1021,7 @@ static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine) intr_regs = perf_sample__intr_regs(sample); if (intr_regs->regs) - regs__printf("intr", intr_regs, e_machine); + regs__printf("intr", intr_regs, e_machine, e_flags); } static void stack_user__printf(struct stack_dump *dump) @@ -1115,6 +1116,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf u64 sample_type; char str[PAGE_SIZE_NAME_LEN]; uint16_t e_machine = EM_NONE; + uint32_t e_flags = 0; if (!dump_trace) return; @@ -1124,7 +1126,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) { struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid); - e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); + e_machine = thread__e_machine(thread, machine, &e_flags); } printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", @@ -1138,10 +1140,10 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf branch_stack__printf(sample, evsel); if (sample_type & PERF_SAMPLE_REGS_USER) - regs_user__printf(sample, e_machine); + regs_user__printf(sample, e_machine, e_flags); if (sample_type & PERF_SAMPLE_REGS_INTR) - regs_intr__printf(sample, e_machine); + regs_intr__printf(sample, e_machine, e_flags); if (sample_type & PERF_SAMPLE_STACK_USER) stack_user__printf(&sample->user_stack); -- cgit v1.2.3 From 2becdd163ab37c9dca05f31da7e943f59f55e510 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 23 Jan 2026 14:22:09 -0800 Subject: perf unwind-libdw: Wire up e_flags for CSKY Wire up the e_flags now it can be read for a thread. The e_flags encode the CSKY ABI level and this can impact which perf registers need setting up for unwinding. Signed-off-by: Ian Rogers Cc: Aditya Bodkhe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chun-Tse Shao Cc: Guo Ren Cc: Howard Chu Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergei Trofimovich Cc: Shimin Guo Cc: Stephen Brennan Cc: Swapnil Sapkal Cc: Tianyou Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 9 +++++---- tools/perf/util/unwind-libdw.h | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 3fdcfa06bf22..05e8e68bd49c 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -213,7 +213,6 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * { struct dwfl_ui_thread_info *dwfl_ui_ti = arg; struct unwind_info *ui = dwfl_ui_ti->ui; - uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL); struct stack_dump *stack = &ui->sample->user_stack; u64 start, end; int offset; @@ -223,7 +222,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word * return false; ret = perf_reg_value(&start, ui->sample->user_regs, - perf_arch_reg_sp(e_machine)); + perf_arch_reg_sp(ui->e_machine)); if (ret) return false; @@ -260,7 +259,7 @@ static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg) int max_dwarf_reg = 0; bool ret; uint16_t e_machine = ui->e_machine; - int e_flags = 0; + int e_flags = ui->e_flags; uint64_t ip_perf_reg = perf_arch_reg_ip(e_machine); Dwarf_Word val = 0; @@ -348,7 +347,8 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, { struct maps *maps = thread__maps(thread); struct machine *machine = maps__machine(maps); - uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); + uint32_t e_flags = 0; + uint16_t e_machine = thread__e_machine(thread, machine, &e_flags); struct dwfl_ui_thread_info *dwfl_ui_ti; static struct unwind_info *ui; Dwfl *dwfl; @@ -370,6 +370,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, .arg = arg, .max_stack = max_stack, .e_machine = e_machine, + .e_flags = e_flags, .best_effort = best_effort }; diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 3dec0ab8bd50..6423bf5a2492 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -20,6 +20,7 @@ struct unwind_info { void *arg; int max_stack; int idx; + uint32_t e_flags; uint16_t e_machine; bool best_effort; struct unwind_entry entries[]; -- cgit v1.2.3 From 0a6fb6604746c92bccc71867fd0bf3d3294335d1 Mon Sep 17 00:00:00 2001 From: Hrishikesh Suresh Date: Sun, 25 Jan 2026 21:06:52 +0100 Subject: perf session: Print all machines in session dump perf_session__fprintf() prints only the host. This has been changed to print details of host and all guests, by traversing through the RB-Tree. These are visible when using high verbosity (-vvvv) in KVM environments, during perf report dumps. Testing: - Test 1: Record the local machine and guest VM using 'perf kvm record' and generate the report using 'perf kvm report -vvvv -D'. The dump should show the threads and other details related to local and guest machine. - 1 Ubuntu VM running on Fedora host - VM is running a noisy program => $ dd if=/dev/urandom of=/dev/null - On host run => $ sudo ./perf kvm --guestvmlinux=/tmp/shared/guest_vmlinux \ --guestkallsyms=/tmp/shared/guest_kallsyms \ --guestmodules=/tmp/shared/guest_modules \ record -a -g -o perf.data.guest and exit after a few seconds. [ perf record: Woken up 9 times to write data ] [ perf record: Captured and wrote 3.150 MB perf.data.guest \ (29311 samples) ] - Generate dump => $ sudo ./perf kvm --guestkallsyms /tmp/shared/guest_kallsyms \ report -vvvv -D -i perf.data.guest > output.txt - Check for threads associated with guest machine. $ grep "Thread 0" output.txt Thread 0 swapper Thread 0 [guest/0] PASS - Test 2: Record the local machine and guest VM using 'perf kvm record' and generate the report using 'perf kvm report'. The functions running on guest VM should be seen in the report. - Same setup as Test 1 but the test looks at the performance profile, to check if the function names are visible. - Peek into profile using => $ sudo ./perf kvm --guestkallsyms /tmp/shared/guest_kallsyms \ report -i perf.data.guest - Samples: 29K of event 'cycles', Event count (approx.): 28711693142 Children Self Command Shared Object Symbol 35.69% 35.69% :5820 [guest.kernel.kallsyms] [g] chacha_permute 11.56% 11.56% :5820 [guest.kernel.kallsyms] [g] entry_SYSRETQ_unsXXX 11.12% 11.12% :5820 [guest.kernel.kallsyms] [g] syscall_return_viXXX 7.36% 7.36% :5820 [guest.kernel.kallsyms] [g] entry_SYSCALL_64_XXX 6.07% 6.07% :5820 [guest.kernel.kallsyms] [g] chacha_block_generic 5.40% 5.40% :5820 [guest.kernel.kallsyms] [g] _copy_to_iter .... PASS - Test 3: Record the local and 2 guest VMs using 'perf kvm record' and generate the report using 'perf kvm report -vvvv -D'. The dump should show the threads and other details related to local and guest machines. - 1 Ubuntu and 1 Alpine VMs running on Fedora host. - Find PIDs of qemu instances and use them during record and report $ pgrep qemu 5816 25098 - Record the activity => $ sudo ./perf kvm record -p 5816,25098 -a -g -o perf.data.guests Warning: PID/TID switch overriding SYSTEM [ perf record: Woken up 325927 times to write data ] [ perf record: Captured and wrote 3.692 MB perf.data.guests \ (57389 samples) ] - Generate dump => $ sudo ./perf kvm report -vvvv -D -i perf.data.guests > output.txt - Check if the threads related to the local machine and guest VMs are present => $ grep "Thread 0" output.txt Thread 0 swapper Thread 0 [guest/0] NOTE: Threads from Ubuntu and Alpine VMs are bundled together and appear as one guest machine. Looking into output.txt => Threads: 6 Thread 0 [guest/0] Thread 5816 :5816 Thread 25098 :25098 Thread 5819 :5819 Thread 5820 :5820 Thread 25103 :25103 To conclude, information is collected for both VMs and not listed as two different guest machines. PASS - Test 4: Check if any guest-related information is printed in perf annotate. This test is included because the command calls perf_session__fprintf() in its code path when using -vvvv option. This could be explained by inability / lack of options for 'perf annotate' to look into guest VM from host machine, due to no option to specify the guest's kallsyms or modules. A similar explanation for 'perf mem' could be used, as perf_session__fprintf() is also present in its code path. - Run annotate => $ sudo ./perf annotate -i perf.data.guest -vvvv > output.txt - Check for threads from local machine or guest VM => $ grep "Thread 0" output.txt Thread 0 swapper Threads from local machine are found while threads from guest VM are not found. It is possibly because of a lack of a guest kallsyms option for DSO matching in perf annotate. PASS - Test 5: Run kvm test available on perf path - $ sudo ./perf test kvm 89: perf kvm tests : Ok PASS Signed-off-by: Hrishikesh Suresh Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Blake Jones Cc: Chun-Tse Shao Cc: Dmitriy Vyukov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra [ Declare 'nd' in the 'for' line and and 'pos' inside the loop body, to make it more compact ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 7c7c65b0f536..ae62d5c9889f 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2730,11 +2730,14 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) size_t perf_session__fprintf(struct perf_session *session, FILE *fp) { - /* - * FIXME: Here we have to actually print all the machines in this - * session, not just the host... - */ - return machine__fprintf(&session->machines.host, fp); + size_t ret = machine__fprintf(&session->machines.host, fp); + + for (struct rb_node *nd = rb_first_cached(&session->machines.guests); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + + ret += machine__fprintf(pos, fp); + } + return ret; } void perf_session__dump_kmaps(struct perf_session *session) -- cgit v1.2.3 From f21fae57744607330ae5dabdd996538faac7f5ab Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:08 +0100 Subject: selftests/bpf: Add a few helpers for bpftool testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to integrate some bpftool tests into test_progs, define a few specific helpers that allow to execute bpftool commands, while possibly retrieving the command output. Those helpers most notably set the path to the bpftool binary under test. This version checks different possible paths relative to the directories where the different test_progs runners are executed, as we want to make sure not to accidentally use a bootstrap version of the binary. Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-1-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/bpftool_helpers.c | 74 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/bpftool_helpers.h | 11 ++++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.c create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9488d076c740..2cc559ecc723 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -747,7 +747,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ - ip_check_defrag_frags.h + ip_check_defrag_frags.h \ + bpftool_helpers.c TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c new file mode 100644 index 000000000000..a5824945a4a5 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "bpftool_helpers.h" +#include +#include +#include + +#define BPFTOOL_PATH_MAX_LEN 64 +#define BPFTOOL_FULL_CMD_MAX_LEN 512 + +#define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" + +static int detect_bpftool_path(char *buffer) +{ + char tmp[BPFTOOL_PATH_MAX_LEN]; + + /* Check default bpftool location (will work if we are running the + * default flavor of test_progs) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Check alternate bpftool location (will work if we are running a + * specific flavor of test_progs, e.g. cpuv4 or no_alu32) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Failed to find bpftool binary */ + return 1; +} + +static int run_command(char *args, char *output_buf, size_t output_max_len) +{ + static char bpftool_path[BPFTOOL_PATH_MAX_LEN] = {0}; + bool suppress_output = !(output_buf && output_max_len); + char command[BPFTOOL_FULL_CMD_MAX_LEN]; + FILE *f; + int ret; + + /* Detect and cache bpftool binary location */ + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + return 1; + + ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", + bpftool_path, args, + suppress_output ? " > /dev/null 2>&1" : ""); + + f = popen(command, "r"); + if (!f) + return 1; + + if (!suppress_output) + fread(output_buf, 1, output_max_len, f); + ret = pclose(f); + + return ret; +} + +int run_bpftool_command(char *args) +{ + return run_command(args, NULL, 0); +} + +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len) +{ + return run_command(args, output_buf, output_max_len); +} + diff --git a/tools/testing/selftests/bpf/bpftool_helpers.h b/tools/testing/selftests/bpf/bpftool_helpers.h new file mode 100644 index 000000000000..dec1ba201410 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#pragma once + +#include +#include +#include + +#define MAX_BPFTOOL_CMD_LEN (256) + +int run_bpftool_command(char *args); +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len); -- cgit v1.2.3 From 1c0b505908a201054dadc87930f550bea2631c1e Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:09 +0100 Subject: selftests/bpf: convert test_bpftool_metadata.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_metadata.sh script validates that bpftool properly returns in its ouptput any metadata generated by bpf programs through some .rodata sections. Port this test to the test_progs framework so that it can be executed automatically in CI. The new test, similarly to the former script, checks that valid data appears both for textual output and json output, as well as for both data not used at all and used data. For the json check part, the expected json string is hardcoded to avoid bringing a new external dependency (eg: a json deserializer) for test_progs. As the test is now converted into test_progs, remove the former script. The newly converted test brings two new subtests: #37/1 bpftool_metadata/metadata_unused:OK #37/2 bpftool_metadata/metadata_used:OK #37 bpftool_metadata:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-2-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/bpftool_metadata.c | 144 +++++++++++++++++++++ .../testing/selftests/bpf/test_bpftool_metadata.sh | 85 ------------ 3 files changed, 144 insertions(+), 86 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_metadata.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2cc559ecc723..1bb7db1ed6ea 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -109,7 +109,6 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_map.sh \ - test_bpftool_metadata.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c new file mode 100644 index 000000000000..408ace90dc7e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPFFS_DIR "/sys/fs/bpf/test_metadata" +#define BPFFS_USED BPFFS_DIR "/used" +#define BPFFS_UNUSED BPFFS_DIR "/unused" + +#define BPF_FILE_USED "metadata_used.bpf.o" +#define BPF_FILE_UNUSED "metadata_unused.bpf.o" +#define METADATA_MAP_NAME "metadata.rodata" + +#define MAX_BPFTOOL_OUTPUT_LEN (64*1024) + +#define MAX_TOKENS_TO_CHECK 3 +static char output[MAX_BPFTOOL_OUTPUT_LEN]; + +struct test_desc { + char *name; + char *bpf_prog; + char *bpffs_path; + char *expected_output[MAX_TOKENS_TO_CHECK]; + char *expected_output_json[MAX_TOKENS_TO_CHECK]; + char *metadata_map_name; +}; + +static int setup(struct test_desc *test) +{ + return mkdir(BPFFS_DIR, 0700); +} + +static void cleanup(struct test_desc *test) +{ + unlink(test->bpffs_path); + rmdir(BPFFS_DIR); +} + +static int check_metadata(char *buf, char * const *tokens, int count) +{ + int i; + + for (i = 0; i < count && tokens[i]; i++) + if (!strstr(buf, tokens[i])) + return 1; + + return 0; +} + +static void run_test(struct test_desc *test) +{ + int ret; + char cmd[MAX_BPFTOOL_CMD_LEN]; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog load %s %s", + test->bpf_prog, test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format prog insert command")) + return; + ret = run_bpftool_command(cmd); + if (!ASSERT_OK(ret, "load program")) + return; + + /* Check output with default format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info")) { + ret = check_metadata(output, test->expected_output, + ARRAY_SIZE(test->expected_output)); + ASSERT_OK(ret, "find metadata"); + } + + /* Check output with json format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog -j show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command in json")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info in json")) { + ret = check_metadata(output, test->expected_output_json, + ARRAY_SIZE(test->expected_output_json)); + ASSERT_OK(ret, "find metadata in json"); + } + + /* Check that the corresponding map can be found and accessed */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map show name %s", + test->metadata_map_name); + if (!ASSERT_GT(ret, 0, "format map check command")) + return; + ASSERT_OK(run_bpftool_command(cmd), "access metadata map"); +} + +static struct test_desc tests[] = { + { + .name = "metadata_unused", + .bpf_prog = BPF_FILE_UNUSED, + .bpffs_path = BPFFS_UNUSED, + .expected_output = { + "a = \"foo\"", + "b = 1" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"foo\",\"b\":1}" + }, + .metadata_map_name = METADATA_MAP_NAME + }, + { + .name = "metadata_used", + .bpf_prog = BPF_FILE_USED, + .bpffs_path = BPFFS_USED, + .expected_output = { + "a = \"bar\"", + "b = 2" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"bar\",\"b\":2}" + }, + .metadata_map_name = METADATA_MAP_NAME + } +}; +static const int tests_count = ARRAY_SIZE(tests); + +void test_bpftool_metadata(void) +{ + int i; + + for (i = 0; i < tests_count; i++) { + if (!test__start_subtest(tests[i].name)) + continue; + if (ASSERT_OK(setup(&tests[i]), "setup bpffs pin dir")) { + run_test(&tests[i]); + cleanup(&tests[i]); + } + } +} diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh deleted file mode 100755 index b5520692f41b..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_metadata.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -BPF_FILE_USED="metadata_used.bpf.o" -BPF_FILE_UNUSED="metadata_unused.bpf.o" - -TESTNAME=bpftool_metadata -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME - -_cleanup() -{ - set +e - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -mkdir $BPF_DIR - -trap cleanup EXIT - -bpftool prog load $BPF_FILE_UNUSED $BPF_DIR/unused - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/unused - -bpftool prog load $BPF_FILE_USED $BPF_DIR/used - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/used - -exit 0 -- cgit v1.2.3 From 2d96bbdfd3b5d28306001036c0161fcb1713f964 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:10 +0100 Subject: selftests/bpf: convert test_bpftool_map_access.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_map.sh script tests that maps read/write accesses are being properly allowed/refused by the kernel depending on a specific fmod_ret program being attached on security_bpf_map function. Rewrite this test to integrate it in the test_progs. The new test spawns a few subtests: #36/1 bpftool_maps_access/unprotected_unpinned:OK #36/2 bpftool_maps_access/unprotected_pinned:OK #36/3 bpftool_maps_access/protected_unpinned:OK #36/4 bpftool_maps_access/protected_pinned:OK #36/5 bpftool_maps_access/nested_maps:OK #36/6 bpftool_maps_access/btf_list:OK #36 bpftool_maps_access:OK Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-3-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/bpftool_maps_access.c | 371 +++++++++++++++++++ tools/testing/selftests/bpf/test_bpftool_map.sh | 398 --------------------- 3 files changed, 371 insertions(+), 399 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1bb7db1ed6ea..2c2f68a171ed 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -108,7 +108,6 @@ TEST_PROGS := test_kmod.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ - test_bpftool_map.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c new file mode 100644 index 000000000000..e0eb869cb1b4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "security_bpf_map.skel.h" + +#define PROTECTED_MAP_NAME "prot_map" +#define UNPROTECTED_MAP_NAME "not_prot_map" +#define BPF_ITER_FILE "bpf_iter_map_elem.bpf.o" +#define BPFFS_PIN_DIR "/sys/fs/bpf/test_bpftool_map" +#define INNER_MAP_NAME "inner_map_tt" +#define OUTER_MAP_NAME "outer_map_tt" + +#define MAP_NAME_MAX_LEN 64 +#define PATH_MAX_LEN 128 + +enum map_protection { + PROTECTED, + UNPROTECTED +}; + +struct test_desc { + char *name; + enum map_protection protection; + struct bpf_map *map; + char *map_name; + bool pinned; + char pin_path[PATH_MAX_LEN]; + bool write_must_fail; +}; + +static struct security_bpf_map *general_setup(void) +{ + struct security_bpf_map *skel; + uint32_t key, value; + int ret, i; + + skel = security_bpf_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto end; + + struct bpf_map *maps[] = {skel->maps.prot_map, skel->maps.not_prot_map}; + + ret = security_bpf_map__attach(skel); + if (!ASSERT_OK(ret, "attach maps security programs")) + goto end_destroy; + + for (i = 0; i < sizeof(maps)/sizeof(struct bpf_map *); i++) { + for (key = 0; key < 2; key++) { + int ret = bpf_map__update_elem(maps[i], &key, + sizeof(key), &key, sizeof(key), + 0); + if (!ASSERT_OK(ret, "set initial map value")) + goto end_destroy; + } + } + + key = 0; + value = 1; + ret = bpf_map__update_elem(skel->maps.prot_status_map, &key, + sizeof(key), &value, sizeof(value), 0); + if (!ASSERT_OK(ret, "configure map protection")) + goto end_destroy; + + if (!ASSERT_OK(mkdir(BPFFS_PIN_DIR, S_IFDIR), "create bpffs pin dir")) + goto end_destroy; + + return skel; +end_destroy: + security_bpf_map__destroy(skel); +end: + return NULL; +} + +static void general_cleanup(struct security_bpf_map *skel) +{ + rmdir(BPFFS_PIN_DIR); + security_bpf_map__destroy(skel); +} + +static void update_test_desc(struct security_bpf_map *skel, + struct test_desc *test) +{ + /* Now that the skeleton is loaded, update all missing fields to + * have the subtest properly configured + */ + if (test->protection == PROTECTED) { + test->map = skel->maps.prot_map; + test->map_name = PROTECTED_MAP_NAME; + } else { + test->map = skel->maps.not_prot_map; + test->map_name = UNPROTECTED_MAP_NAME; + } +} + +static int test_setup(struct security_bpf_map *skel, struct test_desc *desc) +{ + int ret; + + update_test_desc(skel, desc); + + if (desc->pinned) { + ret = snprintf(desc->pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + desc->name); + if (!ASSERT_GT(ret, 0, "format pin path")) + return 1; + ret = bpf_map__pin(desc->map, desc->pin_path); + if (!ASSERT_OK(ret, "pin map")) + return 1; + } + + return 0; +} + +static void test_cleanup(struct test_desc *desc) +{ + if (desc->pinned) + bpf_map__unpin(desc->map, NULL); +} + +static int lookup_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map lookup %s key 0 0 0 0", + map_handle); + if (!ASSERT_GT(ret, 0, "format map lookup cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int read_map_btf_data(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "btf dump map %s", + map_handle); + if (!ASSERT_GT(ret, 0, "format map btf dump cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int write_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map update %s key 0 0 0 0 value 1 1 1 1", map_handle); + if (!ASSERT_GT(ret, 0, "format value write cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int delete_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map delete %s key 0 0 0 0", map_handle); + if (!ASSERT_GT(ret, 0, "format value deletion cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int iterate_on_map_values(char *map_handle, char *iter_pin_path) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "iter pin %s %s map %s", + BPF_ITER_FILE, iter_pin_path, map_handle); + if (!ASSERT_GT(ret, 0, "format iterator creation cmd")) + return 1; + ret = run_bpftool_command(cmd); + if (ret) + return ret; + ret = snprintf(cmd, MAP_NAME_MAX_LEN, "cat %s", iter_pin_path); + if (ret < 0) + goto cleanup; + ret = system(cmd); + +cleanup: + unlink(iter_pin_path); + return ret; +} + +static int create_inner_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type array key 4 value 4 entries 4 name %s", + BPFFS_PIN_DIR, INNER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format inner map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int create_outer_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type hash_of_maps key 4 value 4 entries 2 name %s inner_map name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, OUTER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void delete_pinned_map(char *map_name) +{ + char pin_path[PATH_MAX_LEN]; + int ret; + + ret = snprintf(pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + map_name); + if (ret >= 0) + unlink(pin_path); +} + +static int add_outer_map_entry(int key) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map update pinned %s/%s key %d 0 0 0 value name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, key, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map value addition cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void test_basic_access(struct test_desc *desc) +{ + char map_handle[MAP_NAME_MAX_LEN]; + char iter_pin_path[PATH_MAX_LEN]; + int ret; + + if (desc->pinned) + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "pinned %s", + desc->pin_path); + else + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "name %s", + desc->map_name); + if (!ASSERT_GT(ret, 0, "format map handle")) + return; + + ret = lookup_map_value(map_handle); + ASSERT_OK(ret, "read map value"); + + ret = read_map_btf_data(map_handle); + ASSERT_OK(ret, "read map btf data"); + + ret = write_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "write map value"); + + ret = delete_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "delete map value"); + /* Restore deleted value */ + if (!ret) + write_map_value(map_handle); + + ret = snprintf(iter_pin_path, PATH_MAX_LEN, "%s/iter", BPFFS_PIN_DIR); + if (ASSERT_GT(ret, 0, "format iter pin path")) { + ret = iterate_on_map_values(map_handle, iter_pin_path); + ASSERT_OK(ret, "iterate on map values"); + } +} + +static void test_create_nested_maps(void) +{ + if (!ASSERT_OK(create_inner_map(), "create inner map")) + return; + if (!ASSERT_OK(create_outer_map(), "create outer map")) + goto end_cleanup_inner; + ASSERT_OK(add_outer_map_entry(0), "add a first entry in outer map"); + ASSERT_OK(add_outer_map_entry(1), "add a second entry in outer map"); + ASSERT_NEQ(add_outer_map_entry(2), 0, "add a third entry in outer map"); + + delete_pinned_map(OUTER_MAP_NAME); +end_cleanup_inner: + delete_pinned_map(INNER_MAP_NAME); +} + +static void test_btf_list(void) +{ + ASSERT_OK(run_bpftool_command("btf list"), "list btf data"); +} + +static struct test_desc tests[] = { + { + .name = "unprotected_unpinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = false, + }, + { + .name = "unprotected_pinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = false, + }, + { + .name = "protected_unpinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = true, + }, + { + .name = "protected_pinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = true, + } +}; + +static const size_t tests_count = ARRAY_SIZE(tests); + +void test_bpftool_maps_access(void) +{ + struct security_bpf_map *skel; + struct test_desc *current; + int i; + + skel = general_setup(); + if (!ASSERT_OK_PTR(skel, "prepare programs")) + goto cleanup; + + for (i = 0; i < tests_count; i++) { + current = &tests[i]; + if (!test__start_subtest(current->name)) + continue; + if (ASSERT_OK(test_setup(skel, current), "subtest setup")) { + test_basic_access(current); + test_cleanup(current); + } + } + if (test__start_subtest("nested_maps")) + test_create_nested_maps(); + if (test__start_subtest("btf_list")) + test_btf_list(); + +cleanup: + general_cleanup(skel); +} + diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh b/tools/testing/selftests/bpf/test_bpftool_map.sh deleted file mode 100755 index 515b1df0501e..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_map.sh +++ /dev/null @@ -1,398 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME="bpftool_map" -BPF_FILE="security_bpf_map.bpf.o" -BPF_ITER_FILE="bpf_iter_map_elem.bpf.o" -PROTECTED_MAP_NAME="prot_map" -NOT_PROTECTED_MAP_NAME="not_prot_map" -BPF_FS_TMP_PARENT="/tmp" -BPF_FS_PARENT=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_FS_PARENT=${BPF_FS_PARENT:-$BPF_FS_TMP_PARENT} -# bpftool will mount bpf file system under BPF_DIR if it is not mounted -# under BPF_FS_PARENT. -BPF_DIR="$BPF_FS_PARENT/test_$TESTNAME" -SCRIPT_DIR=$(dirname $(realpath "$0")) -BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE" -BPF_ITER_FILE_PATH="$SCRIPT_DIR/$BPF_ITER_FILE" -BPFTOOL_PATH="bpftool" -# Assume the script is located under tools/testing/selftests/bpf/ -KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../) - -_cleanup() -{ - set +eu - - # If BPF_DIR is a mount point this will not remove the mount point itself. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null - - # Unmount if BPF filesystem was temporarily created. - if [ "$BPF_FS_PARENT" = "$BPF_FS_TMP_PARENT" ]; then - # A loop and recursive unmount are required as bpftool might - # create multiple mounts. For example, a bind mount of the directory - # to itself. The bind mount is created to change mount propagation - # flags on an actual mount point. - max_attempts=3 - attempt=0 - while mountpoint -q "$BPF_DIR" && [ $attempt -lt $max_attempts ]; do - umount -R "$BPF_DIR" 2>/dev/null - attempt=$((attempt+1)) - done - - # The directory still exists. Remove it now. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2>/dev/null - fi -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -check_root_privileges() { - if [ $(id -u) -ne 0 ]; then - echo "Need root privileges" - exit $ksft_skip - fi -} - -# Function to verify bpftool path. -# Parameters: -# $1: bpftool path -verify_bpftool_path() { - local bpftool_path="$1" - if ! "$bpftool_path" version > /dev/null 2>&1; then - echo "Could not run test without bpftool" - exit $ksft_skip - fi -} - -# Function to verify BTF support. -# The test requires BTF support for fmod_ret programs. -verify_btf_support() { - if [ ! -f /sys/kernel/btf/vmlinux ]; then - echo "Could not run test without BTF support" - exit $ksft_skip - fi -} - -# Function to initialize map entries with keys [0..2] and values set to 0. -# Parameters: -# $1: Map name -# $2: bpftool path -initialize_map_entries() { - local map_name="$1" - local bpftool_path="$2" - - for key in 0 1 2; do - "$bpftool_path" map update name "$map_name" key $key 0 0 0 value 0 0 0 $key - done -} - -# Test read access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -access_for_read() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - - # Test read access to the map. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Read access to $key in $map_name failed" - exit 1 - fi - - # Test read access to map's BTF data. - if ! "$bpftool_path" btf dump map "$name_cmd" "$map_name" 1>/dev/null; then - echo " Read access to $map_name for BTF data failed" - exit 1 - fi -} - -# Test write access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_write() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed but should have succeeded" - exit 1 - fi - fi -} - -# Test entry deletion for the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_deletion() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - # Test deletion by key for the map. - # Before deleting, check the key exists. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Key $key does not exist in $map_name" - exit 1 - fi - - # Delete by key. - if "$bpftool_path" map delete "$name_cmd" "$map_name" key $key 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Deletion for $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Deletion for $key in $map_name failed but should have succeeded" - exit 1 - fi - fi - - # After deleting, check the entry existence according to the expected status. - if "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - if [ "$write_should_succeed" = "true" ]; then - echo " Key $key for $map_name was not deleted but should have been deleted" - exit 1 - fi - else - if [ "$write_should_succeed" = "false" ]; then - echo "Key $key for $map_name was deleted but should have not been deleted" - exit 1 - fi - fi - - # Test creation of map's deleted entry, if deletion was successful. - # Otherwise, the entry exists. - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded after deletion attempt but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed after deletion attempt but should have succeeded" - exit 1 - fi - fi -} - -# Test map elements iterator. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: BPF_DIR -# $5: bpf iterator object file path -iterate_map_elem() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local bpf_dir="$4" - local bpf_file="$5" - local pin_path="$bpf_dir/map_iterator" - - "$bpftool_path" iter pin "$bpf_file" "$pin_path" map "$name_cmd" "$map_name" - if [ ! -f "$pin_path" ]; then - echo " Failed to pin iterator to $pin_path" - exit 1 - fi - - cat "$pin_path" 1>/dev/null - rm "$pin_path" 2>/dev/null -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key for rw -# $5: key to delete -# $6: Whether write should succeed (true/false) -# $7: BPF_DIR -# $8: bpf iterator object file path -access_map() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key_for_rw="$4" - local key_to_del="$5" - local write_should_succeed="$6" - local bpf_dir="$7" - local bpf_iter_file_path="$8" - - access_for_read "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" - access_for_write "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" \ - "$write_should_succeed" - access_for_deletion "$name_cmd" "$map_name" "$bpftool_path" "$key_to_del" \ - "$write_should_succeed" - iterate_map_elem "$name_cmd" "$map_name" "$bpftool_path" "$bpf_dir" \ - "$bpf_iter_file_path" -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Map name -# $2: bpftool path -# $3: BPF_DIR -# $4: Whether write should succeed (true/false) -# $5: bpf iterator object file path -test_map_access() { - local map_name="$1" - local bpftool_path="$2" - local bpf_dir="$3" - local pin_path="$bpf_dir/${map_name}_pinned" - local write_should_succeed="$4" - local bpf_iter_file_path="$5" - - # Test access to the map by name. - access_map "name" "$map_name" "$bpftool_path" "0 0 0 0" "1 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" - - # Pin the map to the BPF filesystem - "$bpftool_path" map pin name "$map_name" "$pin_path" - if [ ! -e "$pin_path" ]; then - echo " Failed to pin $map_name" - exit 1 - fi - - # Test access to the pinned map. - access_map "pinned" "$pin_path" "$bpftool_path" "0 0 0 0" "2 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" -} - -# Function to test map creation and map-of-maps -# Parameters: -# $1: bpftool path -# $2: BPF_DIR -test_map_creation_and_map_of_maps() { - local bpftool_path="$1" - local bpf_dir="$2" - local outer_map_name="outer_map_tt" - local inner_map_name="inner_map_tt" - - "$bpftool_path" map create "$bpf_dir/$inner_map_name" type array key 4 \ - value 4 entries 4 name "$inner_map_name" - if [ ! -f "$bpf_dir/$inner_map_name" ]; then - echo " Failed to create inner map file at $bpf_dir/$outer_map_name" - return 1 - fi - - "$bpftool_path" map create "$bpf_dir/$outer_map_name" type hash_of_maps \ - key 4 value 4 entries 2 name "$outer_map_name" inner_map name "$inner_map_name" - if [ ! -f "$bpf_dir/$outer_map_name" ]; then - echo " Failed to create outer map file at $bpf_dir/$outer_map_name" - return 1 - fi - - # Add entries to the outer map by name and by pinned path. - "$bpftool_path" map update pinned "$bpf_dir/$outer_map_name" key 0 0 0 0 \ - value pinned "$bpf_dir/$inner_map_name" - "$bpftool_path" map update name "$outer_map_name" key 1 0 0 0 value \ - name "$inner_map_name" - - # The outer map should be full by now. - # The following map update command is expected to fail. - if "$bpftool_path" map update name "$outer_map_name" key 2 0 0 0 value name \ - "$inner_map_name" 2>/dev/null; then - echo " Update for $outer_map_name succeeded but should have failed" - exit 1 - fi -} - -# Function to test map access with the btf list command -# Parameters: -# $1: bpftool path -test_map_access_with_btf_list() { - local bpftool_path="$1" - - # The btf list command iterates over maps for - # loaded BPF programs. - if ! "$bpftool_path" btf list 1>/dev/null; then - echo " Failed to access btf data" - exit 1 - fi -} - -set -eu - -trap cleanup_skip EXIT - -check_root_privileges - -verify_bpftool_path "$BPFTOOL_PATH" - -verify_btf_support - -trap cleanup EXIT - -# Load and attach the BPF programs to control maps access. -"$BPFTOOL_PATH" prog loadall "$BPF_FILE_PATH" "$BPF_DIR" autoattach - -initialize_map_entries "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" -initialize_map_entries "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" - -# Activate the map protection mechanism. Protection status is controlled -# by a value stored in the prot_status_map at index 0. -"$BPFTOOL_PATH" map update name prot_status_map key 0 0 0 0 value 1 0 0 0 - -# Test protected map (write should fail). -test_map_access "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "false" \ - "$BPF_ITER_FILE_PATH" - -# Test not protected map (write should succeed). -test_map_access "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "true" \ - "$BPF_ITER_FILE_PATH" - -test_map_creation_and_map_of_maps "$BPFTOOL_PATH" "$BPF_DIR" - -test_map_access_with_btf_list "$BPFTOOL_PATH" - -exit 0 -- cgit v1.2.3 From 08e8f1ef3df270daef4ffc9c4bb15669f72d5d2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 14 Jan 2026 22:47:56 -0800 Subject: kernel-chktaint: add reporting for tainted modules Check all loaded modules and report any that have their 'taint' flags set. The tainted module output format is: * () Example output: Kernel is "tainted" for the following reasons: * externally-built ('out-of-tree') module was loaded (#12) * unsigned module was loaded (#13) Raw taint value as int/string: 12288/'G OE ' Tainted modules: * dump_test (OE) Link: https://lkml.kernel.org/r/20260115064756.531592-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Thorsten Leemhuis Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- tools/debugging/kernel-chktaint | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index e7da0909d097..e1571c04afb5 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -211,9 +211,25 @@ else addout "J" echo " * fwctl's mutating debug interface was used (#19)" fi +echo "Raw taint value as int/string: $taint/'$out'" + +# report on any tainted loadable modules +[ "$1" = "" ] && [ -r /sys/module/ ] && \ + cnt=`grep [A-Z] /sys/module/*/taint | wc -l` || cnt=0 +if [ $cnt -ne 0 ]; then + echo + echo "Tainted modules:" + for dir in `ls /sys/module` ; do + if [ -r /sys/module/$dir/taint ]; then + modtnt=`cat /sys/module/$dir/taint` + [ "$modtnt" = "" ] || echo " * $dir ($modtnt)" + fi + done +fi + +echo echo "For a more detailed explanation of the various taint flags see" echo " Documentation/admin-guide/tainted-kernels.rst in the Linux kernel sources" echo " or https://kernel.org/doc/html/latest/admin-guide/tainted-kernels.html" -echo "Raw taint value as int/string: $taint/'$out'" #EOF# -- cgit v1.2.3 From a84a1fe0fb2e9bfccb1d5a2929a249960a93264d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 25 Jan 2026 12:55:24 +0200 Subject: selftests: net: fix wrong boolean evaluation in __exit__ The __exit__ method receives ex_type as the exception class when an exception occurs. The previous code used implicit boolean evaluation: terminate = self.terminate or (self._exit_wait and ex_type) ^^^^^^^^^^^ In Python, the and operator can be used with non-boolean values, but it does not always return a boolean result. This is probably not what we want, because 'self._exit_wait and ex_type' could return the actual ex_type value (the exception class) rather than a boolean True when an exception occurs. Use explicit `ex_type is not None` check to properly evaluate whether an exception occurred, returning a boolean result. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20260125105524.773993-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 37243103aee3..85884f3e827b 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -160,7 +160,7 @@ class bkg(cmd): def __exit__(self, ex_type, ex_value, ex_tb): # Force termination on exception - terminate = self.terminate or (self._exit_wait and ex_type) + terminate = self.terminate or (self._exit_wait and ex_type is not None) return self.process(terminate=terminate, fail=self.check_fail) -- cgit v1.2.3 From 53eb797ffc3abe30418b19777922b55fb339fc1f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:41 +0000 Subject: mm/rmap: remove anon_vma_merge() function This function is confusing, we already have the concept of anon_vma merge to adjacent VMA's anon_vma's to increase probability of anon_vma compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.), as well as anon_vma reuse, along side the usual VMA merge logic. We can remove the anon_vma check as it is redundant - a merge would not have been permitted with removal if the anon_vma's were not the same (and in the case of an unfaulted/faulted merge, we would have already set the unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()). Avoid overloading this term when we're very simply unlinking anon_vma state from a removed VMA upon merge. Link: https://lkml.kernel.org/r/56bbe45e309f7af197b1c4f94a9a0c8931ff2d29.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- mm/vma.c | 2 +- tools/testing/vma/vma_internal.h | 5 ----- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..832bfc0ccfc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) return __anon_vma_prepare(vma); } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID diff --git a/mm/vma.c b/mm/vma.c index f81a5cfcd7cc..6c458c8656b8 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -381,7 +381,7 @@ again: fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..93e5792306d9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From d17f02417a337de0a0c6e763e938ee5e41a97c3d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:45 +0000 Subject: mm/rmap: separate out fork-only logic on anon_vma_clone() Specify which operation is being performed to anon_vma_clone(), which allows us to do checks specific to each operation type, as well as to separate out and make clear that the anon_vma reuse logic is absolutely specific to fork only. This opens the door to further refactorings and refinements later as we have more information to work with. Link: https://lkml.kernel.org/r/cf7da7a2d973cdc72a1b80dd9a73260519e8fa9f.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 11 +++++- mm/rmap.c | 74 +++++++++++++++++++++++++++------------- mm/vma.c | 6 ++-- tools/testing/vma/vma_internal.h | 11 +++++- 4 files changed, 74 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/mm/internal.h b/mm/internal.h index aac4ec53fe15..5585059f0209 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) struct anon_vma *folio_get_anon_vma(const struct folio *folio); -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index a5ce9163454a..6ddbf58111ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma) } static void check_anon_vma_clone(struct vm_area_struct *dst, - struct vm_area_struct *src) + struct vm_area_struct *src, + enum vma_operation operation) { /* The write lock must be held. */ mmap_assert_write_locked(src->vm_mm); - /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ - VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + /* If not a fork then must be on same mm. */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); /* If we have anything to do src->anon_vma must be provided. */ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); @@ -249,6 +250,40 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * must be the same across dst and src. */ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); + /* + * Essentially equivalent to above - if not a no-op, we should expect + * dst->anon_vma to be set for everything except a fork. + */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma && + !dst->anon_vma); + /* For the anon_vma to be compatible, it can only be singular. */ + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && + !list_is_singular(&src->anon_vma_chain)); +#ifdef CONFIG_PER_VMA_LOCK + /* Only merging an unfaulted VMA leaves the destination attached. */ + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && + vma_is_attached(dst)); +#endif +} + +static void maybe_reuse_anon_vma(struct vm_area_struct *dst, + struct anon_vma *anon_vma) +{ + /* If already populated, nothing to do.*/ + if (dst->anon_vma) + return; + + /* + * We reuse an anon_vma if any linking VMAs were unmapped and it has + * only a single child at most. + */ + if (anon_vma->num_active_vmas > 0) + return; + if (anon_vma->num_children > 1) + return; + + dst->anon_vma = anon_vma; + anon_vma->num_active_vmas++; } static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); @@ -258,6 +293,7 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * all of the anon_vma objects contained within @src anon_vma_chain's. * @dst: The destination VMA with an empty anon_vma_chain. * @src: The source VMA we wish to duplicate. + * @operation: The type of operation which resulted in the clone. * * This is the heart of the VMA side of the anon_vma implementation - we invoke * this function whenever we need to set up a new VMA's anon_vma state. @@ -280,17 +316,17 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * * Returns: 0 on success, -ENOMEM on failure. */ -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *active_anon_vma = src->anon_vma; - check_anon_vma_clone(dst, src); + check_anon_vma_clone(dst, src, operation); - if (!src->anon_vma) + if (!active_anon_vma) return 0; - check_anon_vma_clone(dst, src); - /* * Allocate AVCs. We don't need an anon_vma lock for this as we * are not updating the anon_vma rbtree nor are we changing @@ -318,22 +354,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); - - /* - * Reuse existing anon_vma if it has no vma and only one - * anon_vma child. - * - * Root anon_vma is never reused: - * it has self-parent reference and at least one child. - */ - if (!dst->anon_vma && src->anon_vma && - anon_vma->num_children < 2 && - anon_vma->num_active_vmas == 0) - dst->anon_vma = anon_vma; + if (operation == VMA_OP_FORK) + maybe_reuse_anon_vma(dst, anon_vma); } - if (dst->anon_vma) + + if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); + + anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: @@ -372,7 +400,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - rc = anon_vma_clone(vma, pvma); + rc = anon_vma_clone(vma, pvma, VMA_OP_FORK); /* An error arose or an existing anon_vma was reused, all done then. */ if (rc || vma->anon_vma) { put_anon_vma(anon_vma); diff --git a/mm/vma.c b/mm/vma.c index 6c458c8656b8..3dbe414eff89 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_vmi; - err = anon_vma_clone(new, vma); + err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; @@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst, vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; @@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 93e5792306d9..7fa56dcc53a6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -600,6 +600,14 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru return 0; } -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { /* For testing purposes. We indicate that an anon_vma has been cloned. */ if (src->anon_vma != NULL) { -- cgit v1.2.3 From 7ce6dfc603ed01044ebe58472a584d9995281ca2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 26 Jan 2026 14:05:48 -0800 Subject: perf script: Fix script_fetch_insn for more than just x86 The script_fetch_insn code was only supported on natively running x86. Implement a crude elf_machine_max_instruction_length function and use to give an instruction length on more than just x86. Use the ELF machine to determine the length to use to support cross-architecture development. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shimin Guo Cc: Yujie Liu [ Conditionally define EM_CSKY and EM_LOONGARCH for older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/Build | 1 - tools/perf/arch/x86/util/archinsn.c | 27 -------- tools/perf/builtin-script.c | 16 +---- .../perf/scripts/python/Perf-Trace-Util/Context.c | 2 +- tools/perf/tests/dlfilter-test.c | 1 - tools/perf/util/archinsn.h | 12 ---- tools/perf/util/dlfilter.c | 3 +- tools/perf/util/sample.c | 77 ++++++++++++++++++++++ tools/perf/util/sample.h | 7 ++ tools/perf/util/trace-event-scripting.c | 16 ----- tools/perf/util/trace-event.h | 3 - 11 files changed, 87 insertions(+), 78 deletions(-) delete mode 100644 tools/perf/arch/x86/util/archinsn.c delete mode 100644 tools/perf/util/archinsn.h (limited to 'tools') diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index fad256252bb9..76127eefde8b 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -14,6 +14,5 @@ perf-util-y += iostat.o perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o perf-util-y += auxtrace.o -perf-util-y += archinsn.o perf-util-y += intel-pt.o perf-util-y += intel-bts.o diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c deleted file mode 100644 index 546feda08428..000000000000 --- a/tools/perf/arch/x86/util/archinsn.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "archinsn.h" -#include "event.h" -#include "machine.h" -#include "thread.h" -#include "symbol.h" -#include "../../../../arch/x86/include/asm/insn.h" - -void arch_fetch_insn(struct perf_sample *sample, - struct thread *thread, - struct machine *machine) -{ - struct insn insn; - int len, ret; - bool is64bit = false; - - if (!sample->ip) - return; - len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit); - if (len <= 0) - return; - - ret = insn_decode(&insn, sample->insn, len, - is64bit ? INSN_MODE_64 : INSN_MODE_32); - if (ret >= 0 && insn.length <= len) - sample->insn_len = insn.length; -} diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 6ec225c697a4..69af25780fc5 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -37,7 +37,6 @@ #include "ui/ui.h" #include "print_binary.h" #include "print_insn.h" -#include "archinsn.h" #include #include #include @@ -90,7 +89,6 @@ static bool print_flags; static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); static int max_blocks; -static bool native_arch; static struct dlfilter *dlfilter; static int dlargc; static char **dlargv; @@ -1627,7 +1625,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, { int printed = 0; - script_fetch_insn(sample, thread, machine, native_arch); + perf_sample__fetch_insn(sample, thread, machine); if (PRINT_FIELD(INSNLEN)) printed += fprintf(fp, " ilen: %d", sample->insn_len); @@ -4034,7 +4032,6 @@ int cmd_script(int argc, const char **argv) .set = false, .default_no_sample = true, }; - struct utsname uts; char *script_path = NULL; const char *dlfilter_file = NULL; const char **__argv; @@ -4456,17 +4453,6 @@ script_found: if (symbol__init(env) < 0) goto out_delete; - uname(&uts); - if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */ - native_arch = true; - } else if (env->arch) { - if (!strcmp(uts.machine, env->arch)) - native_arch = true; - else if (!strcmp(uts.machine, "x86_64") && - !strcmp(env->arch, "i386")) - native_arch = true; - } - script.session = session; script__setup_sample_type(&script); diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c index 60dcfe56d4d9..c19f44610983 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c +++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c @@ -93,7 +93,7 @@ static PyObject *perf_sample_insn(PyObject *obj, PyObject *args) if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) { struct machine *machine = maps__machine(thread__maps(c->al->thread)); - script_fetch_insn(c->sample, c->al->thread, machine, /*native_arch=*/true); + perf_sample__fetch_insn(c->sample, c->al->thread, machine); } if (!c->sample->insn_len) Py_RETURN_NONE; /* N.B. This is a return statement */ diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c index 80a1c941138d..e63790c61d53 100644 --- a/tools/perf/tests/dlfilter-test.c +++ b/tools/perf/tests/dlfilter-test.c @@ -30,7 +30,6 @@ #include "symbol.h" #include "synthetic-events.h" #include "util.h" -#include "archinsn.h" #include "dlfilter.h" #include "tests.h" #include "util/sample.h" diff --git a/tools/perf/util/archinsn.h b/tools/perf/util/archinsn.h deleted file mode 100644 index 448cbb6b8d7e..000000000000 --- a/tools/perf/util/archinsn.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef INSN_H -#define INSN_H 1 - -struct perf_sample; -struct machine; -struct thread; - -void arch_fetch_insn(struct perf_sample *sample, - struct thread *thread, - struct machine *machine); - -#endif diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index c0afcbd954f8..dc31b5e7149e 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -234,8 +234,7 @@ static const __u8 *dlfilter__insn(void *ctx, __u32 *len) struct machine *machine = maps__machine(thread__maps(al->thread)); if (machine) - script_fetch_insn(d->sample, al->thread, machine, - /*native_arch=*/true); + perf_sample__fetch_insn(d->sample, al->thread, machine); } } diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c index 605fee971f55..6d70a5db00a2 100644 --- a/tools/perf/util/sample.c +++ b/tools/perf/util/sample.c @@ -1,9 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include "sample.h" #include "debug.h" +#include "thread.h" +#include +#ifndef EM_CSKY +#define EM_CSKY 252 +#endif +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif #include #include #include +#include "../../arch/x86/include/asm/insn.h" void perf_sample__init(struct perf_sample *sample, bool all) { @@ -41,3 +50,71 @@ struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample) } return sample->intr_regs; } + +static int elf_machine_max_instruction_length(uint16_t e_machine) +{ + switch (e_machine) { + /* Fixed 4-byte (32-bit) architectures */ + case EM_AARCH64: + case EM_PPC: + case EM_PPC64: + case EM_MIPS: + case EM_SPARC: + case EM_SPARCV9: + case EM_ALPHA: + case EM_LOONGARCH: + case EM_PARISC: + case EM_SH: + return 4; + + /* Variable length or mixed-mode architectures */ + case EM_ARM: /* Variable due to Thumb/Thumb-2 */ + case EM_RISCV: /* Variable due to Compressed (C) extension */ + case EM_CSKY: /* Variable (16 or 32 bit) */ + case EM_ARC: /* Variable (ARCompact) */ + return 4; + case EM_S390: /* Variable (2, 4, or 6 bytes) */ + return 6; + case EM_68K: + return 10; + case EM_386: + case EM_X86_64: + return 15; + case EM_XTENSA: /* Variable (FLIX) */ + return 16; + default: + return MAX_INSN; + } +} + +void perf_sample__fetch_insn(struct perf_sample *sample, + struct thread *thread, + struct machine *machine) +{ + int ret, len; + bool is64bit = false; + uint16_t e_machine; + + if (!sample->ip || sample->insn_len != 0) + return; + + e_machine = thread__e_machine(thread, machine); + len = elf_machine_max_instruction_length(e_machine); + len = thread__memcpy(thread, machine, sample->insn, + sample->ip, len, + &is64bit); + if (len <= 0) + return; + + sample->insn_len = len; + + if (e_machine == EM_386 || e_machine == EM_X86_64) { + /* Refine the x86 instruction length with the decoder. */ + struct insn insn; + + ret = insn_decode(&insn, sample->insn, len, + is64bit ? INSN_MODE_64 : INSN_MODE_32); + if (ret >= 0 && insn.length <= len) + sample->insn_len = insn.length; + } +} diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h index a8307b20a9ea..3cce8dd202aa 100644 --- a/tools/perf/util/sample.h +++ b/tools/perf/util/sample.h @@ -5,6 +5,9 @@ #include #include +struct machine; +struct thread; + /* number of register is bound by the number of bits in regs_dump::mask (64) */ #define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64)) @@ -127,6 +130,10 @@ void perf_sample__exit(struct perf_sample *sample); struct regs_dump *perf_sample__user_regs(struct perf_sample *sample); struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample); +void perf_sample__fetch_insn(struct perf_sample *sample, + struct thread *thread, + struct machine *machine); + /* * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get * 8-byte alignment. diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c index 72abb28b7b5a..fa850e44cb46 100644 --- a/tools/perf/util/trace-event-scripting.c +++ b/tools/perf/util/trace-event-scripting.c @@ -13,7 +13,6 @@ #include #endif -#include "archinsn.h" #include "debug.h" #include "event.h" #include "trace-event.h" @@ -274,21 +273,6 @@ void setup_perl_scripting(void) #endif #endif -#if !defined(__i386__) && !defined(__x86_64__) -void arch_fetch_insn(struct perf_sample *sample __maybe_unused, - struct thread *thread __maybe_unused, - struct machine *machine __maybe_unused) -{ -} -#endif - -void script_fetch_insn(struct perf_sample *sample, struct thread *thread, - struct machine *machine, bool native_arch) -{ - if (sample->insn_len == 0 && native_arch) - arch_fetch_insn(sample, thread, machine); -} - static const struct { u32 flags; const char *name; diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index 71e680bc3d4b..914d9b69ed62 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -116,9 +116,6 @@ extern unsigned int scripting_max_stack; struct scripting_ops *script_spec__lookup(const char *spec); int script_spec__for_each(int (*cb)(struct scripting_ops *ops, const char *spec)); -void script_fetch_insn(struct perf_sample *sample, struct thread *thread, - struct machine *machine, bool native_arch); - void setup_perl_scripting(void); void setup_python_scripting(void); -- cgit v1.2.3 From f33e7aa42ea79f2142f073df777c01125def45e5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 26 Jan 2026 14:05:49 -0800 Subject: perf callchain: Switch callchain_param_setup from an arch to an e_machine Increase use of e_machine by replacing callchain_param_setup's arch argument to be an e_machine typically read from the session. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shimin Guo Cc: Yujie Liu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 6 ++---- tools/perf/builtin-script.c | 4 ++-- tools/perf/util/callchain.c | 4 ++-- tools/perf/util/callchain.h | 2 +- tools/perf/util/sample.c | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2e936928e8c0..810ffd66b11c 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -448,7 +448,7 @@ static int report__setup_sample_type(struct report *rep) } } - callchain_param_setup(sample_type, perf_env__arch(perf_session__env(rep->session))); + callchain_param_setup(sample_type, perf_session__e_machine(session)); if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) { ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n" @@ -1283,7 +1283,6 @@ static int process_attr(const struct perf_tool *tool __maybe_unused, struct evlist **pevlist) { struct perf_session *session; - struct perf_env *env; u64 sample_type; int err; @@ -1297,8 +1296,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused, */ sample_type = evlist__combined_sample_type(*pevlist); session = (*pevlist)->session; - env = perf_session__env(session); - callchain_param_setup(sample_type, perf_env__arch(env)); + callchain_param_setup(sample_type, perf_session__e_machine(session)); return 0; } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 69af25780fc5..c7d5a325b5cb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2859,7 +2859,7 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event, * on events sample_type. */ sample_type = evlist__combined_sample_type(evlist); - callchain_param_setup(sample_type, perf_env__arch(perf_session__env(scr->session))); + callchain_param_setup(sample_type, perf_session__e_machine(evsel__session(evsel))); /* Enable fields for callchain entries */ if (symbol_conf.use_callchain && @@ -3834,7 +3834,7 @@ static void script__setup_sample_type(struct perf_script *script) struct perf_session *session = script->session; u64 sample_type = evlist__combined_sample_type(session->evlist); - callchain_param_setup(sample_type, perf_env__arch(session->machines.host.env)); + callchain_param_setup(sample_type, perf_session__e_machine(session)); if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) { pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n" diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 515bb8b5da01..8ff0898799ee 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1681,7 +1681,7 @@ void callchain_cursor_reset(struct callchain_cursor *cursor) map_symbol__exit(&node->ms); } -void callchain_param_setup(u64 sample_type, const char *arch) +void callchain_param_setup(u64 sample_type, uint16_t e_machine) { if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) { if ((sample_type & PERF_SAMPLE_REGS_USER) && @@ -1703,7 +1703,7 @@ void callchain_param_setup(u64 sample_type, const char *arch) * erroneous entries. Always skipping the LR and starting from the FP * can result in missing entries. */ - if (callchain_param.record_mode == CALLCHAIN_FP && !strcmp(arch, "arm64")) + if (callchain_param.record_mode == CALLCHAIN_FP && e_machine == EM_AARCH64) dwarf_callchain_users = true; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 2a52af8c80ac..df54ddb8c0cb 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -303,7 +303,7 @@ int callchain_branch_counts(struct callchain_root *root, u64 *branch_count, u64 *predicted_count, u64 *abort_count, u64 *cycles_count); -void callchain_param_setup(u64 sample_type, const char *arch); +void callchain_param_setup(u64 sample_type, uint16_t e_machine); bool callchain_cnode_matched(struct callchain_node *base_cnode, struct callchain_node *pair_cnode); diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c index 6d70a5db00a2..8f82aaf1aab6 100644 --- a/tools/perf/util/sample.c +++ b/tools/perf/util/sample.c @@ -98,7 +98,7 @@ void perf_sample__fetch_insn(struct perf_sample *sample, if (!sample->ip || sample->insn_len != 0) return; - e_machine = thread__e_machine(thread, machine); + e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); len = elf_machine_max_instruction_length(e_machine); len = thread__memcpy(thread, machine, sample->insn, sample->ip, len, -- cgit v1.2.3 From 4b870f62c5079b48a6a19c852f4db5d2569a5239 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 26 Jan 2026 14:05:50 -0800 Subject: perf thread-stack: Switch thread_stack__init() to use e_machine The architecture type is used to set the retpoline state. Rather than use the arch string switch to using the ELF machine that's readily available within the thread. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shimin Guo Cc: Yujie Liu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread-stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index c6a0a27b12c2..c5ce741b0744 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -157,10 +157,10 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread, if (thread__maps(thread) && maps__machine(thread__maps(thread))) { struct machine *machine = maps__machine(thread__maps(thread)); - const char *arch = perf_env__arch(machine->env); + uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL); ts->kernel_start = machine__kernel_start(machine); - if (!strcmp(arch, "x86")) + if (e_machine == EM_X86_64 || e_machine == EM_386) ts->rstate = X86_RETPOLINE_POSSIBLE; } else { ts->kernel_start = 1ULL << 63; -- cgit v1.2.3 From 23262369e650c9995505eb4f69f16449467e6bfe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf disasm: Constify variables storing the result of bsearch() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6b36287f30fe..ddcc488f2e5f 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -761,7 +761,7 @@ static void ins__sort(struct arch *arch) static const struct ins_ops *__ins__find(const struct arch *arch, const char *name, struct disasm_line *dl) { - struct ins *ins; + const struct ins *ins; const int nmemb = arch->nr_instructions; if (arch__is_powerpc(arch)) { -- cgit v1.2.3 From b42868624c7d00206f77d19a6fbfea73a44ff6f2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf metricgroup: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 25c75fdbfc52..40a1e14de418 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -367,7 +367,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, static bool match_metric_or_groups(const char *metric_or_groups, const char *sought) { int len; - char *m; + const char *m; if (!sought) return false; @@ -450,11 +450,10 @@ static const char *code_characters = ",-=@"; static int encode_metric_id(struct strbuf *sb, const char *x) { - char *c; int ret = 0; for (; *x; x++) { - c = strchr(code_characters, *x); + const char *c = strchr(code_characters, *x); if (c) { ret = strbuf_addch(sb, '!'); if (ret) -- cgit v1.2.3 From 678ed6b707e4b2db250f255d2f959322896dae65 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 02:03:01 -0300 Subject: perf strlist: Don't write to const memory Do a strdup to the list string and parse from it, free at the end. This is to deal with newer glibcs const-correctness. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/strlist.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 8a868cbeffae..98883672fcf4 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -139,21 +139,25 @@ out: return err; } -static int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir) +static int strlist__parse_list(struct strlist *slist, const char *list, const char *subst_dir) { - char *sep; + char *sep, *s = strdup(list), *sdup = s; int err; + if (s == NULL) + return -ENOMEM; + while ((sep = strchr(s, ',')) != NULL) { *sep = '\0'; err = strlist__parse_list_entry(slist, s, subst_dir); - *sep = ','; if (err != 0) return err; s = sep + 1; } - return *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0; + err = *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0; + free(sdup); + return err; } struct strlist *strlist__new(const char *list, const struct strlist_config *config) -- cgit v1.2.3 From f1321cce848c558fde4c0c6bcd5e53f3cefd3af2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 02:09:37 -0300 Subject: perf session: Don't write to memory pointed to a const pointer Since it is freshly allocated just attribute it to a non-const pointer and then change it via that pointer. That way we avoid const-correctness warnings in recent glibc versions. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ae62d5c9889f..d0053618f540 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2676,7 +2676,7 @@ bool perf_session__has_switch_events(struct perf_session *session) int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr) { - char *bracket; + char *bracket, *name; struct ref_reloc_sym *ref; struct kmap *kmap; @@ -2684,13 +2684,13 @@ int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u6 if (ref == NULL) return -ENOMEM; - ref->name = strdup(symbol_name); + ref->name = name = strdup(symbol_name); if (ref->name == NULL) { free(ref); return -ENOMEM; } - bracket = strchr(ref->name, ']'); + bracket = strchr(name, ']'); if (bracket) *bracket = '\0'; -- cgit v1.2.3 From 0e47251e8cc438d5b59fcd86d27efade01976fe1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 02:15:30 -0300 Subject: perf hwmon_pmu: Constify the variables returning bsearch() on const tables To address const-correctness errors on newer glibcs (-Werror=discarded-qualifiers). Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hwmon_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/hwmon_pmu.c b/tools/perf/util/hwmon_pmu.c index 279d6b1a47f0..fb3ffa8d32ad 100644 --- a/tools/perf/util/hwmon_pmu.c +++ b/tools/perf/util/hwmon_pmu.c @@ -161,7 +161,7 @@ bool parse_hwmon_filename(const char *filename, bool *alarm) { char fn_type[24]; - const char **elem; + const char * const *elem; const char *fn_item = NULL; size_t fn_item_len; -- cgit v1.2.3 From 0341eab66ba03a1f439db91f03bccd5b0a360842 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 02:22:00 -0300 Subject: perf tp_pmu: Address const-correctness errors in recent glibcs To avoid having more variables, just cast the const variable searched to non-const since the result will not be modified, its only later that that variable will be used to modify something, but then its non-const memory being modified, so using a cast is the cheapest thing here. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/tp_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/tp_pmu.c b/tools/perf/util/tp_pmu.c index eddb9807131a..c2be8c9f9084 100644 --- a/tools/perf/util/tp_pmu.c +++ b/tools/perf/util/tp_pmu.c @@ -192,7 +192,7 @@ bool tp_pmu__have_event(struct perf_pmu *pmu __maybe_unused, const char *name) char *dup_name, *colon; int id; - colon = strchr(name, ':'); + colon = strchr((char *)name, ':'); if (colon == NULL) return false; -- cgit v1.2.3 From 97b81df7225830c4db3c17ed1235d2f3eb613d3d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf trace-event: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/trace-event-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index c8755679281e..45774722f249 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -482,7 +482,7 @@ char *tracepoint_id_to_name(u64 config) static struct tracepoint_path *tracepoint_name_to_path(const char *name) { struct tracepoint_path *path = zalloc(sizeof(*path)); - char *str = strchr(name, ':'); + const char *str = strchr(name, ':'); if (path == NULL || str == NULL) { free(path); -- cgit v1.2.3 From 0e14cb3b24f8f301cf6490a4493afc98321ed5bb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf units: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/units.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c index 4c6a86e1cb54..0bbacf5a29aa 100644 --- a/tools/perf/util/units.c +++ b/tools/perf/util/units.c @@ -12,7 +12,7 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags) struct parse_tag *i = tags; while (i->tag) { - char *s = strchr(str, i->tag); + const char *s = strchr(str, i->tag); if (s) { unsigned long int value; -- cgit v1.2.3 From 21c0bc9144834e39762dd6fddbb255ebb80cf079 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf time-utils: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/time-utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c index 1b91ccd4d523..d43c4577d7eb 100644 --- a/tools/perf/util/time-utils.c +++ b/tools/perf/util/time-utils.c @@ -325,7 +325,7 @@ static int percent_comma_split(struct perf_time_interval *ptime_buf, int num, } static int one_percent_convert(struct perf_time_interval *ptime_buf, - const char *ostr, u64 start, u64 end, char *c) + const char *ostr, u64 start, u64 end, const char *c) { char *str; int len = strlen(ostr), ret; @@ -358,7 +358,7 @@ static int one_percent_convert(struct perf_time_interval *ptime_buf, int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num, const char *ostr, u64 start, u64 end) { - char *c; + const char *c; /* * ostr example: -- cgit v1.2.3 From 79bba3a1834e7ba6c437674582cc9f3ae6fb638c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf demangle-java: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/demangle-java.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c index ddf33d58bcd3..c3cb327ed562 100644 --- a/tools/perf/util/demangle-java.c +++ b/tools/perf/util/demangle-java.c @@ -158,7 +158,7 @@ char * java_demangle_sym(const char *str, int flags) { char *buf, *ptr; - char *p; + const char *p; size_t len, l1 = 0; if (!str) -- cgit v1.2.3 From 8bf093acb3f1f07d846c86e32308f9f9954ed579 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf bpf-event: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 2e6da3ad0a4f..67e7786bb878 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -733,7 +733,8 @@ kallsyms_process_symbol(void *data, const char *_name, char type __maybe_unused, u64 start) { char disp[KSYM_NAME_LEN]; - char *module, *name; + const char *module; + char *name; unsigned long id; int err = 0; -- cgit v1.2.3 From 68abacb0686651dd3f0bbce2fa94b438afeb2fc4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 01:15:47 -0300 Subject: perf jitdump: Constify variables storing the result of strchr() on const tables As newer glibcs will propagate the const attribute of the searched table to its return. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jitdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index d4fe35f9d9a5..e0ce8b904729 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -758,7 +758,7 @@ jit_inject(struct jit_buf_desc *jd, const char *path) static int jit_detect(const char *mmap_name, pid_t pid, struct nsinfo *nsi, bool *in_pidns) { - char *p; + const char *p; char *end = NULL; pid_t pid2; -- cgit v1.2.3 From 873e7de9f9a3b67b08b380057b2c7828b0d78cae Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:44 -0800 Subject: selftests/vsock: increase timeout to 1200 Increase the timeout from 300s to 1200s. On a modern bare metal server my last run showed the new set of tests taking ~400s. Multiply by an (arbitrary) factor of three to account for slower/nested runners. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-4-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/settings b/tools/testing/selftests/vsock/settings index 694d70710ff0..79b65bdf05db 100644 --- a/tools/testing/selftests/vsock/settings +++ b/tools/testing/selftests/vsock/settings @@ -1 +1 @@ -timeout=300 +timeout=1200 -- cgit v1.2.3 From 423ec6383edba92e78abbb99a776147b3fe7b2ca Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:45 -0800 Subject: selftests/vsock: add namespace helpers to vmtest.sh Add functions for initializing namespaces with the different vsock NS modes. Callers can use add_namespaces() and del_namespaces() to create namespaces global0, global1, local0, and local1. The add_namespaces() function initializes global0, local0, etc... with their respective vsock NS mode by toggling child_ns_mode before creating the namespace. Remove namespaces upon exiting the program in cleanup(). This is unlikely to be needed for a healthy run, but it is useful for tests that are manually killed mid-test. This patch is in preparation for later namespace tests. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-5-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c7b270dd77a9..c2bdc293b94c 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -49,6 +49,7 @@ readonly TEST_DESCS=( ) readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) +readonly NS_MODES=("local" "global") VERBOSE=0 @@ -103,6 +104,36 @@ check_result() { fi } +add_namespaces() { + local orig_mode + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + for mode in "${NS_MODES[@]}"; do + echo "${mode}" > /proc/sys/net/vsock/child_ns_mode + ip netns add "${mode}0" 2>/dev/null + ip netns add "${mode}1" 2>/dev/null + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode +} + +init_namespaces() { + for mode in "${NS_MODES[@]}"; do + # we need lo for qemu port forwarding + ip netns exec "${mode}0" ip link set dev lo up + ip netns exec "${mode}1" ip link set dev lo up + done +} + +del_namespaces() { + for mode in "${NS_MODES[@]}"; do + ip netns del "${mode}0" &>/dev/null + ip netns del "${mode}1" &>/dev/null + log_host "removed ns ${mode}0" + log_host "removed ns ${mode}1" + done +} + vm_ssh() { ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" return $? @@ -110,6 +141,7 @@ vm_ssh() { cleanup() { terminate_pidfiles "${!PIDFILES[@]}" + del_namespaces } check_args() { -- cgit v1.2.3 From fd1b41725d585f29029b8d8610a155f26727c18e Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:46 -0800 Subject: selftests/vsock: prepare vm management helpers for namespaces Add namespace support to vm management, ssh helpers, and vsock_test wrapper functions. This enables running VMs and test helpers in specific namespaces, which is required for upcoming namespace isolation tests. The functions still work correctly within the init ns, though the caller must now pass "init_ns" explicitly. No functional changes for existing tests. All have been updated to pass "init_ns" explicitly. Affected functions (such as vm_start() and vm_ssh()) now wrap their commands with 'ip netns exec' when executing commands in non-init namespaces. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-6-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 101 ++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c2bdc293b94c..c4d73dd0a4cf 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -135,7 +135,18 @@ del_namespaces() { } vm_ssh() { - ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" + local ns_exec + + if [[ "${1}" == init_ns ]]; then + ns_exec="" + else + ns_exec="ip netns exec ${1}" + fi + + shift + + ${ns_exec} ssh -q -o UserKnownHostsFile=/dev/null -p "${SSH_HOST_PORT}" localhost "$@" + return $? } @@ -258,10 +269,12 @@ terminate_pidfiles() { vm_start() { local pidfile=$1 + local ns=$2 local logfile=/dev/null local verbose_opt="" local kernel_opt="" local qemu_opts="" + local ns_exec="" local qemu qemu=$(command -v "${QEMU}") @@ -282,7 +295,11 @@ vm_start() { kernel_opt="${KERNEL_CHECKOUT}" fi - vng \ + if [[ "${ns}" != "init_ns" ]]; then + ns_exec="ip netns exec ${ns}" + fi + + ${ns_exec} vng \ --run \ ${kernel_opt} \ ${verbose_opt} \ @@ -297,6 +314,7 @@ vm_start() { } vm_wait_for_ssh() { + local ns=$1 local i i=0 @@ -304,7 +322,8 @@ vm_wait_for_ssh() { if [[ ${i} -gt ${WAIT_PERIOD_MAX} ]]; then die "Timed out waiting for guest ssh" fi - if vm_ssh -- true; then + + if vm_ssh "${ns}" -- true; then break fi i=$(( i + 1 )) @@ -338,30 +357,41 @@ wait_for_listener() } vm_wait_for_listener() { - local port=$1 + local ns=$1 + local port=$2 - vm_ssh <&1 | log_guest rc=$? else - vm_ssh -- "${VSOCK_TEST}" \ + vm_ssh "${ns}" -- "${VSOCK_TEST}" \ --mode=server \ --peer-cid="${cid}" \ --control-port="${port}" \ @@ -381,7 +411,7 @@ vm_vsock_test() { return $rc fi - vm_wait_for_listener "${port}" + vm_wait_for_listener "${ns}" "${port}" rc=$? fi set +o pipefail @@ -390,22 +420,28 @@ vm_vsock_test() { } host_vsock_test() { - local host=$1 - local cid=$2 - local port=$3 + local ns=$1 + local host=$2 + local cid=$3 + local port=$4 local rc + local cmd="${VSOCK_TEST}" + if [[ "${ns}" != "init_ns" ]]; then + cmd="ip netns exec ${ns} ${cmd}" + fi + # log output and use pipefail to respect vsock_test errors set -o pipefail if [[ "${host}" != server ]]; then - ${VSOCK_TEST} \ + ${cmd} \ --mode=client \ --peer-cid="${cid}" \ --control-host="${host}" \ --control-port="${port}" 2>&1 | log_host rc=$? else - ${VSOCK_TEST} \ + ${cmd} \ --mode=server \ --peer-cid="${cid}" \ --control-port="${port}" 2>&1 | log_host & @@ -416,7 +452,7 @@ host_vsock_test() { return $rc fi - host_wait_for_listener "${port}" + host_wait_for_listener "${ns}" "${port}" rc=$? fi set +o pipefail @@ -460,11 +496,11 @@ log_guest() { } test_vm_server_host_client() { - if ! vm_vsock_test "server" 2 "${TEST_GUEST_PORT}"; then + if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then return "${KSFT_FAIL}" fi - if ! host_vsock_test "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then + if ! host_vsock_test "init_ns" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then return "${KSFT_FAIL}" fi @@ -472,11 +508,11 @@ test_vm_server_host_client() { } test_vm_client_host_server() { - if ! host_vsock_test "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then + if ! host_vsock_test "init_ns" "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then + if ! vm_vsock_test "init_ns" "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi @@ -486,13 +522,14 @@ test_vm_client_host_server() { test_vm_loopback() { local port=60000 # non-forwarded local port - vm_ssh -- modprobe vsock_loopback &> /dev/null || : + vm_ssh "init_ns" -- modprobe vsock_loopback &> /dev/null || : - if ! vm_vsock_test "server" 1 "${port}"; then + if ! vm_vsock_test "init_ns" "server" 1 "${port}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "127.0.0.1" 1 "${port}"; then + + if ! vm_vsock_test "init_ns" "127.0.0.1" 1 "${port}"; then return "${KSFT_FAIL}" fi @@ -550,8 +587,8 @@ run_shared_vm_test() { host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') - vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops') + vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -569,13 +606,13 @@ run_shared_vm_test() { rc=$KSFT_FAIL fi - vm_oops_cnt_after=$(vm_ssh -- dmesg | grep -i 'Oops' | wc -l) + vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l) if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL @@ -621,8 +658,8 @@ cnt_total=0 if shared_vm_tests_requested "${ARGS[@]}"; then log_host "Booting up VM" pidfile="$(create_pidfile)" - vm_start "${pidfile}" - vm_wait_for_ssh + vm_start "${pidfile}" "init_ns" + vm_wait_for_ssh "init_ns" log_host "VM booted up" run_shared_vm_tests "${ARGS[@]}" -- cgit v1.2.3 From 4e870ac81df7e9628bdc08ff6f957a42e80582ee Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:47 -0800 Subject: selftests/vsock: add vm_dmesg_{warn,oops}_count() helpers These functions are reused by the VM tests to collect and compare dmesg warnings and oops counts. The future VM-specific tests use them heavily. This patches relies on vm_ssh() already supporting namespaces. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-7-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c4d73dd0a4cf..4b5929ffc9eb 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -380,6 +380,17 @@ host_wait_for_listener() { fi } +vm_dmesg_oops_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg 2>/dev/null | grep -c -i 'Oops' +} + +vm_dmesg_warn_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock' +} vm_vsock_test() { local ns=$1 @@ -587,8 +598,8 @@ run_shared_vm_test() { host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') - vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') + vm_oops_cnt_before=$(vm_dmesg_oops_count "init_ns") + vm_warn_cnt_before=$(vm_dmesg_warn_count "init_ns") name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -606,13 +617,13 @@ run_shared_vm_test() { rc=$KSFT_FAIL fi - vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l) + vm_oops_cnt_after=$(vm_dmesg_oops_count "init_ns") if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') + vm_warn_cnt_after=$(vm_dmesg_warn_count "init_ns") if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL -- cgit v1.2.3 From 7418f3bb3aa289fbf52f93b551e79ba647371f51 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:48 -0800 Subject: selftests/vsock: use ss to wait for listeners instead of /proc/net Replace /proc/net parsing with ss(8) for detecting listening sockets in wait_for_listener() functions and add support for TCP, VSOCK, and Unix socket protocols. The previous implementation parsed /proc/net/tcp using awk to detect listening sockets, but this approach could not support vsock because vsock does not export socket information to /proc/net/. Instead, use ss so that we can detect listeners on tcp, vsock, and unix. The protocol parameter is now required for all wait_for_listener family functions (wait_for_listener, vm_wait_for_listener, host_wait_for_listener) to explicitly specify which socket type to wait for. ss is added to the dependency check in check_deps(). Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-8-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 47 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 4b5929ffc9eb..0e681d4c3a15 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -182,7 +182,7 @@ check_args() { } check_deps() { - for dep in vng ${QEMU} busybox pkill ssh; do + for dep in vng ${QEMU} busybox pkill ssh ss; do if [[ ! -x $(command -v "${dep}") ]]; then echo -e "skip: dependency ${dep} not found!\n" exit "${KSFT_SKIP}" @@ -337,21 +337,32 @@ wait_for_listener() local port=$1 local interval=$2 local max_intervals=$3 - local protocol=tcp - local pattern + local protocol=$4 local i - pattern=":$(printf "%04X" "${port}") " - - # for tcp protocol additionally check the socket state - [ "${protocol}" = "tcp" ] && pattern="${pattern}0A" - for i in $(seq "${max_intervals}"); do - if awk -v pattern="${pattern}" \ - 'BEGIN {rc=1} $2" "$4 ~ pattern {rc=0} END {exit rc}' \ - /proc/net/"${protocol}"*; then + case "${protocol}" in + tcp) + if ss --listening --tcp --numeric | grep -q ":${port} "; then + break + fi + ;; + vsock) + if ss --listening --vsock --numeric | grep -q ":${port} "; then + break + fi + ;; + unix) + # For unix sockets, port is actually the socket path + if ss --listening --unix | grep -q "${port}"; then + break + fi + ;; + *) + echo "Unknown protocol: ${protocol}" >&2 break - fi + ;; + esac sleep "${interval}" done } @@ -359,23 +370,25 @@ wait_for_listener() vm_wait_for_listener() { local ns=$1 local port=$2 + local protocol=$3 vm_ssh "${ns}" < Date: Wed, 21 Jan 2026 14:11:49 -0800 Subject: selftests/vsock: add tests for proc sys vsock ns_mode Add tests for the /proc/sys/net/vsock/{ns_mode,child_ns_mode} interfaces. Namely, that they accept/report "global" and "local" strings and enforce their access policies. Start a convention of commenting the test name over the test description. Add test name comments over test descriptions that existed before this convention. Add a check_netns() function that checks if the test requires namespaces and if the current kernel supports namespaces. Skip tests that require namespaces if the system does not have namespace support. This patch is the first to add tests that do *not* re-use the same shared VM. For that reason, it adds a run_ns_tests() function to run these tests and filter out the shared VM tests. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-9-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 140 +++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 0e681d4c3a15..38785a102236 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -41,14 +41,38 @@ readonly KERNEL_CMDLINE="\ virtme.ssh virtme_ssh_channel=tcp virtme_ssh_user=$USER \ " readonly LOG=$(mktemp /tmp/vsock_vmtest_XXXX.log) -readonly TEST_NAMES=(vm_server_host_client vm_client_host_server vm_loopback) + +# Namespace tests must use the ns_ prefix. This is checked in check_netns() and +# is used to determine if a test needs namespace setup before test execution. +readonly TEST_NAMES=( + vm_server_host_client + vm_client_host_server + vm_loopback + ns_host_vsock_ns_mode_ok + ns_host_vsock_child_ns_mode_ok +) readonly TEST_DESCS=( + # vm_server_host_client "Run vsock_test in server mode on the VM and in client mode on the host." + + # vm_client_host_server "Run vsock_test in client mode on the VM and in server mode on the host." + + # vm_loopback "Run vsock_test using the loopback transport in the VM." + + # ns_host_vsock_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode strings on the host." + + # ns_host_vsock_child_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable." ) -readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) +readonly USE_SHARED_VM=( + vm_server_host_client + vm_client_host_server + vm_loopback +) readonly NS_MODES=("local" "global") VERBOSE=0 @@ -196,6 +220,20 @@ check_deps() { fi } +check_netns() { + local tname=$1 + + # If the test requires NS support, check if NS support exists + # using /proc/self/ns + if [[ "${tname}" =~ ^ns_ ]] && + [[ ! -e /proc/self/ns ]]; then + log_host "No NS support detected for test ${tname}" + return 1 + fi + + return 0 +} + check_vng() { local tested_versions local version @@ -519,6 +557,54 @@ log_guest() { LOG_PREFIX=guest log "$@" } +ns_get_mode() { + local ns=$1 + + ip netns exec "${ns}" cat /proc/sys/net/vsock/ns_mode 2>/dev/null +} + +test_ns_host_vsock_ns_mode_ok() { + for mode in "${NS_MODES[@]}"; do + local actual + + actual=$(ns_get_mode "${mode}0") + if [[ "${actual}" != "${mode}" ]]; then + log_host "expected mode ${mode}, got ${actual}" + return "${KSFT_FAIL}" + fi + done + + return "${KSFT_PASS}" +} + +test_ns_host_vsock_child_ns_mode_ok() { + local orig_mode + local rc + + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + rc="${KSFT_PASS}" + for mode in "${NS_MODES[@]}"; do + local ns="${mode}0" + + if echo "${mode}" 2>/dev/null > /proc/sys/net/vsock/ns_mode; then + log_host "ns_mode should be read-only but write succeeded" + rc="${KSFT_FAIL}" + continue + fi + + if ! echo "${mode}" > /proc/sys/net/vsock/child_ns_mode; then + log_host "child_ns_mode should be writable to ${mode}" + rc="${KSFT_FAIL}" + continue + fi + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode + + return "${rc}" +} + test_vm_server_host_client() { if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then return "${KSFT_FAIL}" @@ -592,6 +678,11 @@ run_shared_vm_tests() { continue fi + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + run_shared_vm_test "${arg}" check_result "$?" "${arg}" done @@ -645,6 +736,49 @@ run_shared_vm_test() { return "${rc}" } +run_ns_tests() { + for arg in "${ARGS[@]}"; do + if shared_vm_test "${arg}"; then + continue + fi + + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + + add_namespaces + + name=$(echo "${arg}" | awk '{ print $1 }') + log_host "Executing test_${name}" + + host_oops_before=$(dmesg 2>/dev/null | grep -c -i 'Oops') + host_warn_before=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + eval test_"${name}" + rc=$? + + host_oops_after=$(dmesg 2>/dev/null | grep -c -i 'Oops') + if [[ "${host_oops_after}" -gt "${host_oops_before}" ]]; then + echo "FAIL: kernel oops detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + host_warn_after=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + if [[ "${host_warn_after}" -gt "${host_warn_before}" ]]; then + echo "FAIL: kernel warning detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + check_result "${rc}" "${name}" + + del_namespaces + done +} + BUILD=0 QEMU="qemu-system-$(uname -m)" @@ -690,6 +824,8 @@ if shared_vm_tests_requested "${ARGS[@]}"; then terminate_pidfiles "${pidfile}" fi +run_ns_tests "${ARGS[@]}" + echo "SUMMARY: PASS=${cnt_pass} SKIP=${cnt_skip} FAIL=${cnt_fail}" echo "Log: ${LOG}" -- cgit v1.2.3 From 605caec5adc2956263a86b48eecfc52ee5c95dae Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:50 -0800 Subject: selftests/vsock: add namespace tests for CID collisions Add tests to verify CID collision rules across different vsock namespace modes. 1. Two VMs with the same CID cannot start in different global namespaces (ns_global_same_cid_fails) 2. Two VMs with the same CID can start in different local namespaces (ns_local_same_cid_ok) 3. VMs with the same CID can coexist when one is in a global namespace and another is in a local namespace (ns_global_local_same_cid_ok and ns_local_global_same_cid_ok) The tests ns_global_local_same_cid_ok and ns_local_global_same_cid_ok make sure that ordering does not matter. The tests use a shared helper function namespaces_can_boot_same_cid() that attempts to start two VMs with identical CIDs in the specified namespaces and verifies whether VM initialization failed or succeeded. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-10-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 38785a102236..1bf537410ea6 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -50,6 +50,10 @@ readonly TEST_NAMES=( vm_loopback ns_host_vsock_ns_mode_ok ns_host_vsock_child_ns_mode_ok + ns_global_same_cid_fails + ns_local_same_cid_ok + ns_global_local_same_cid_ok + ns_local_global_same_cid_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -66,6 +70,18 @@ readonly TEST_DESCS=( # ns_host_vsock_child_ns_mode_ok "Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable." + + # ns_global_same_cid_fails + "Check QEMU fails to start two VMs with same CID in two different global namespaces." + + # ns_local_same_cid_ok + "Check QEMU successfully starts two VMs with same CID in two different local namespaces." + + # ns_global_local_same_cid_ok + "Check QEMU successfully starts one VM in a global ns and then another VM in a local ns with the same CID." + + # ns_local_global_same_cid_ok + "Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID." ) readonly USE_SHARED_VM=( @@ -577,6 +593,68 @@ test_ns_host_vsock_ns_mode_ok() { return "${KSFT_PASS}" } +namespaces_can_boot_same_cid() { + local ns0=$1 + local ns1=$2 + local pidfile1 pidfile2 + local rc + + pidfile1="$(create_pidfile)" + + # The first VM should be able to start. If it can't then we have + # problems and need to return non-zero. + if ! vm_start "${pidfile1}" "${ns0}"; then + return 1 + fi + + pidfile2="$(create_pidfile)" + vm_start "${pidfile2}" "${ns1}" + rc=$? + terminate_pidfiles "${pidfile1}" "${pidfile2}" + + return "${rc}" +} + +test_ns_global_same_cid_fails() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "global1"; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_local_global_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_global_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + test_ns_host_vsock_child_ns_mode_ok() { local orig_mode local rc -- cgit v1.2.3 From 0424ee7c3a1721c099544f36580cf6dd1661856d Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:51 -0800 Subject: selftests/vsock: add tests for host <-> vm connectivity with namespaces Add tests to validate namespace correctness using vsock_test and socat. The vsock_test tool is used to validate expected success tests, but socat is used for expected failure tests. socat is used to ensure that connections are rejected outright instead of failing due to some other socket behavior (as tested in vsock_test). Additionally, socat is already required for tunneling TCP traffic from vsock_test. Using only one of the vsock_test tests like 'test_stream_client_close_client' would have yielded a similar result, but doing so wouldn't remove the socat dependency. Additionally, check for the dependency socat. socat needs special handling beyond just checking if it is on the path because it must be compiled with support for both vsock and unix. The function check_socat() checks that this support exists. Add more padding to test name printf strings because the tests added in this patch would otherwise overflow. Add vm_dmesg_* helpers to encapsulate checking dmesg for oops and warnings. Add ability to pass extra args to host-side vsock_test so that tests that cause false positives may be skipped with arg --skip. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-11-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 572 +++++++++++++++++++++++++++++++- 1 file changed, 568 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 1bf537410ea6..a9eaf37bc31b 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -7,6 +7,7 @@ # * virtme-ng # * busybox-static (used by virtme-ng) # * qemu (used by virtme-ng) +# * socat # # shellcheck disable=SC2317,SC2119 @@ -54,6 +55,19 @@ readonly TEST_NAMES=( ns_local_same_cid_ok ns_global_local_same_cid_ok ns_local_global_same_cid_ok + ns_diff_global_host_connect_to_global_vm_ok + ns_diff_global_host_connect_to_local_vm_fails + ns_diff_global_vm_connect_to_global_host_ok + ns_diff_global_vm_connect_to_local_host_fails + ns_diff_local_host_connect_to_local_vm_fails + ns_diff_local_vm_connect_to_local_host_fails + ns_diff_global_to_local_loopback_local_fails + ns_diff_local_to_global_loopback_fails + ns_diff_local_to_local_loopback_fails + ns_diff_global_to_global_loopback_ok + ns_same_local_loopback_ok + ns_same_local_host_connect_to_local_vm_ok + ns_same_local_vm_connect_to_local_host_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -82,6 +96,45 @@ readonly TEST_DESCS=( # ns_local_global_same_cid_ok "Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID." + + # ns_diff_global_host_connect_to_global_vm_ok + "Run vsock_test client in global ns with server in VM in another global ns." + + # ns_diff_global_host_connect_to_local_vm_fails + "Run socat to test a process in a global ns fails to connect to a VM in a local ns." + + # ns_diff_global_vm_connect_to_global_host_ok + "Run vsock_test client in VM in a global ns with server in another global ns." + + # ns_diff_global_vm_connect_to_local_host_fails + "Run socat to test a VM in a global ns fails to connect to a host process in a local ns." + + # ns_diff_local_host_connect_to_local_vm_fails + "Run socat to test a host process in a local ns fails to connect to a VM in another local ns." + + # ns_diff_local_vm_connect_to_local_host_fails + "Run socat to test a VM in a local ns fails to connect to a host process in another local ns." + + # ns_diff_global_to_local_loopback_local_fails + "Run socat to test a loopback vsock in a global ns fails to connect to a vsock in a local ns." + + # ns_diff_local_to_global_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in a global ns." + + # ns_diff_local_to_local_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in another local ns." + + # ns_diff_global_to_global_loopback_ok + "Run socat to test a loopback vsock in a global ns successfully connects to a vsock in another global ns." + + # ns_same_local_loopback_ok + "Run socat to test a loopback vsock in a local ns successfully connects to a vsock in the same ns." + + # ns_same_local_host_connect_to_local_vm_ok + "Run vsock_test client in a local ns with server in VM in same ns." + + # ns_same_local_vm_connect_to_local_host_ok + "Run vsock_test client in VM in a local ns with server in same ns." ) readonly USE_SHARED_VM=( @@ -112,7 +165,7 @@ usage() { for ((i = 0; i < ${#TEST_NAMES[@]}; i++)); do name=${TEST_NAMES[${i}]} desc=${TEST_DESCS[${i}]} - printf "\t%-35s%-35s\n" "${name}" "${desc}" + printf "\t%-55s%-35s\n" "${name}" "${desc}" done echo @@ -222,7 +275,7 @@ check_args() { } check_deps() { - for dep in vng ${QEMU} busybox pkill ssh ss; do + for dep in vng ${QEMU} busybox pkill ssh ss socat; do if [[ ! -x $(command -v "${dep}") ]]; then echo -e "skip: dependency ${dep} not found!\n" exit "${KSFT_SKIP}" @@ -273,6 +326,20 @@ check_vng() { fi } +check_socat() { + local support_string + + support_string="$(socat -V)" + + if [[ "${support_string}" != *"WITH_VSOCK 1"* ]]; then + die "err: socat is missing vsock support" + fi + + if [[ "${support_string}" != *"WITH_UNIX 1"* ]]; then + die "err: socat is missing unix support" + fi +} + handle_build() { if [[ ! "${BUILD}" -eq 1 ]]; then return @@ -321,6 +388,14 @@ terminate_pidfiles() { done } +terminate_pids() { + local pid + + for pid in "$@"; do + kill -SIGTERM "${pid}" &>/dev/null || : + done +} + vm_start() { local pidfile=$1 local ns=$2 @@ -459,6 +534,28 @@ vm_dmesg_warn_count() { vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock' } +vm_dmesg_check() { + local pidfile=$1 + local ns=$2 + local oops_before=$3 + local warn_before=$4 + local oops_after warn_after + + oops_after=$(vm_dmesg_oops_count "${ns}") + if [[ "${oops_after}" -gt "${oops_before}" ]]; then + echo "FAIL: kernel oops detected on vm in ns ${ns}" | log_host + return 1 + fi + + warn_after=$(vm_dmesg_warn_count "${ns}") + if [[ "${warn_after}" -gt "${warn_before}" ]]; then + echo "FAIL: kernel warning detected on vm in ns ${ns}" | log_host + return 1 + fi + + return 0 +} + vm_vsock_test() { local ns=$1 local host=$2 @@ -502,6 +599,8 @@ host_vsock_test() { local host=$2 local cid=$3 local port=$4 + shift 4 + local extra_args=("$@") local rc local cmd="${VSOCK_TEST}" @@ -516,13 +615,15 @@ host_vsock_test() { --mode=client \ --peer-cid="${cid}" \ --control-host="${host}" \ - --control-port="${port}" 2>&1 | log_host + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host rc=$? else ${cmd} \ --mode=server \ --peer-cid="${cid}" \ - --control-port="${port}" 2>&1 | log_host & + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host & rc=$? if [[ $rc -ne 0 ]]; then @@ -593,6 +694,468 @@ test_ns_host_vsock_ns_mode_ok() { return "${KSFT_PASS}" } +test_ns_diff_global_host_connect_to_global_vm_ok() { + local oops_before warn_before + local pids pid pidfile + local ns0 ns1 port + declare -a pids + local unixfile + ns0="global0" + ns1="global1" + port=1234 + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + unixfile=$(mktemp -u /tmp/XXXX.sock) + ip netns exec "${ns1}" \ + socat TCP-LISTEN:"${TEST_HOST_PORT}",fork \ + UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${TEST_HOST_PORT}" "tcp" + + ip netns exec "${ns0}" socat UNIX-LISTEN:"${unixfile}",fork \ + TCP-CONNECT:localhost:"${TEST_HOST_PORT}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${unixfile}" "unix" + + vm_vsock_test "${ns0}" "server" 2 "${TEST_GUEST_PORT}" + vm_wait_for_listener "${ns0}" "${TEST_GUEST_PORT}" "tcp" + host_vsock_test "${ns1}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pids "${pids[@]}" + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_host_connect_to_local_vm_fails() { + local oops_before warn_before + local ns0="global0" + local ns1="local0" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${VSOCK_CID}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == "TEST" ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_vm_connect_to_global_host_ok() { + local oops_before warn_before + local ns0="global0" + local ns1="global1" + local port=12345 + local unixfile + local dmesg_rc + local pidfile + local pids + local rc + + init_namespaces + + declare -a pids + + log_host "Setup socat bridge from ns ${ns0} to ns ${ns1} over port ${port}" + + unixfile=$(mktemp -u /tmp/XXXX.sock) + + ip netns exec "${ns0}" \ + socat TCP-LISTEN:"${port}" UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${port}" "tcp" + + ip netns exec "${ns1}" \ + socat UNIX-LISTEN:"${unixfile}" TCP-CONNECT:127.0.0.1:"${port}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${unixfile}" "unix" + + log_host "Launching ${VSOCK_TEST} in ns ${ns1}" + host_vsock_test "${ns1}" "server" "${VSOCK_CID}" "${port}" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_vsock_test "${ns0}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" + +} + +test_ns_diff_global_vm_connect_to_local_host_fails() { + local ns0="global0" + local ns1="local0" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pid}" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_host_connect_to_local_vm_fails() { + local ns0="local0" + local ns1="local1" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_vm_connect_to_local_host_fails() { + local oops_before warn_before + local ns0="local0" + local ns1="local1" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +__test_loopback_two_netns() { + local ns0=$1 + local ns1=$2 + local port=12345 + local result + local pid + + modprobe vsock_loopback &> /dev/null || : + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" 2>/dev/null & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + log_host "Launching socat in ns ${ns0}" + echo TEST | ip netns exec "${ns0}" socat STDIN VSOCK-CONNECT:1:"${port}" 2>/dev/null + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == TEST ]]; then + return 0 + fi + + return 1 +} + +test_ns_diff_global_to_local_loopback_local_fails() { + init_namespaces + + if ! __test_loopback_two_netns "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_global_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_local_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_global_to_global_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "global0" "global1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "local0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_host_connect_to_local_vm_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + vm_vsock_test "${ns}" "server" 2 "${TEST_GUEST_PORT}" + + # Skip test 29 (transport release use-after-free): This test attempts + # binding both G2H and H2G CIDs. Because virtio-vsock (G2H) doesn't + # support local namespaces the test will fail when + # transport_g2h->stream_allow() returns false. This edge case only + # happens for vsock_test in client mode on the host in a local + # namespace. This is a false positive. + host_vsock_test "${ns}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" --skip=29 + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_same_local_vm_connect_to_local_host_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + host_vsock_test "${ns}" "server" "${VSOCK_CID}" "${port}" + vm_vsock_test "${ns}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + namespaces_can_boot_same_cid() { local ns0=$1 local ns1=$2 @@ -882,6 +1445,7 @@ fi check_args "${ARGS[@]}" check_deps check_vng +check_socat handle_build echo "1..${#ARGS[@]}" -- cgit v1.2.3 From b3b7b33264c69a3a97723c31a14c7a19b2fce503 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:52 -0800 Subject: selftests/vsock: add tests for namespace deletion Add tests that validate vsock sockets are resilient to deleting namespaces. The vsock sockets should still function normally. The function check_ns_delete_doesnt_break_connection() is added to re-use the step-by-step logic of 1) setup connections, 2) delete ns, 3) check that the connections are still ok. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-12-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index a9eaf37bc31b..dc8dbe74a6d0 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -68,6 +68,9 @@ readonly TEST_NAMES=( ns_same_local_loopback_ok ns_same_local_host_connect_to_local_vm_ok ns_same_local_vm_connect_to_local_host_ok + ns_delete_vm_ok + ns_delete_host_ok + ns_delete_both_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -135,6 +138,15 @@ readonly TEST_DESCS=( # ns_same_local_vm_connect_to_local_host_ok "Run vsock_test client in VM in a local ns with server in same ns." + + # ns_delete_vm_ok + "Check that deleting the VM's namespace does not break the socket connection" + + # ns_delete_host_ok + "Check that deleting the host's namespace does not break the socket connection" + + # ns_delete_both_ok + "Check that deleting the VM and host's namespaces does not break the socket connection" ) readonly USE_SHARED_VM=( @@ -1287,6 +1299,78 @@ test_vm_loopback() { return "${KSFT_PASS}" } +check_ns_delete_doesnt_break_connection() { + local pipefile pidfile outfile + local ns0="global0" + local ns1="global1" + local port=12345 + local pids=() + local rc=0 + + init_namespaces + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + vm_wait_for_ssh "${ns0}" + + outfile=$(mktemp) + vm_ssh "${ns0}" -- \ + socat VSOCK-LISTEN:"${port}",fork STDOUT > "${outfile}" 2>/dev/null & + pids+=($!) + vm_wait_for_listener "${ns0}" "${port}" "vsock" + + # We use a pipe here so that we can echo into the pipe instead of using + # socat and a unix socket file. We just need a name for the pipe (not a + # regular file) so use -u. + pipefile=$(mktemp -u /tmp/vmtest_pipe_XXXX) + ip netns exec "${ns1}" \ + socat PIPE:"${pipefile}" VSOCK-CONNECT:"${VSOCK_CID}":"${port}" & + pids+=($!) + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -e '"${pipefile}"' ]]; do sleep 1; done; exit 0' + + if [[ "$1" == "vm" ]]; then + ip netns del "${ns0}" + elif [[ "$1" == "host" ]]; then + ip netns del "${ns1}" + elif [[ "$1" == "both" ]]; then + ip netns del "${ns0}" + ip netns del "${ns1}" + fi + + echo "TEST" > "${pipefile}" + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -s '"${outfile}"' ]]; do sleep 1; done; exit 0' + + if grep -q "TEST" "${outfile}"; then + rc="${KSFT_PASS}" + else + rc="${KSFT_FAIL}" + fi + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${outfile}" "${pipefile}" + + return "${rc}" +} + +test_ns_delete_vm_ok() { + check_ns_delete_doesnt_break_connection "vm" +} + +test_ns_delete_host_ok() { + check_ns_delete_doesnt_break_connection "host" +} + +test_ns_delete_both_ok() { + check_ns_delete_doesnt_break_connection "both" +} + shared_vm_test() { local tname -- cgit v1.2.3 From fd4eeb30b9e30ca1118a618be0755287bcbb2da9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 26 Jan 2026 04:57:35 +0000 Subject: objtool: Print bfd_vma as unsigned long long on ia32-x86_64 cross build When objtool is cross-compiled in ia32 container for x86_64 target it fails with the following errors: > disas.c: In function 'disas_print_addr_sym': > disas.c:173:38: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'bfd_vma' {aka 'long long unsigned int'} [-Werror=format=] > 173 | DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); > | ^~~~~~~~~~~~ ~~~~ > | | > | bfd_vma {aka long long unsigned int} Provide a correct printf-fmt depending on sizeof(bfd_vma). Fixes: 5d859dff266f ("objtool: Print symbol during disassembly") Signed-off-by: Dmitry Safonov Reviewed-by: Alexandre Chartre Link: https://patch.msgid.link/20260126-objtool-ia32-v1-1-bb6feaf17566@arista.com Signed-off-by: Josh Poimboeuf --- tools/objtool/disas.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c index 2b5059f55e40..26f08d41f2b1 100644 --- a/tools/objtool/disas.c +++ b/tools/objtool/disas.c @@ -108,6 +108,8 @@ static int sprint_name(char *str, const char *name, unsigned long offset) #define DINFO_FPRINTF(dinfo, ...) \ ((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__)) +#define bfd_vma_fmt \ + __builtin_choose_expr(sizeof(bfd_vma) == sizeof(unsigned long), "%#lx <%s>", "%#llx <%s>") static int disas_result_fprintf(struct disas_context *dctx, const char *fmt, va_list ap) @@ -170,10 +172,10 @@ static void disas_print_addr_sym(struct section *sec, struct symbol *sym, if (sym) { sprint_name(symstr, sym->name, addr - sym->offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr); } else { str = offstr(sec, addr); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str); free(str); } } @@ -252,7 +254,7 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) * example: "lea 0x0(%rip),%rdi". The kernel can reference * the next IP with _THIS_IP_ macro. */ - DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, "_THIS_IP_"); return; } @@ -264,11 +266,11 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) */ if (reloc->sym->type == STT_SECTION) { str = offstr(reloc->sym->sec, reloc->sym->offset + offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str); free(str); } else { sprint_name(symstr, reloc->sym->name, offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr); } } @@ -311,7 +313,7 @@ static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo) */ sym = insn_call_dest(insn); if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) { - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, sym->name); return; } -- cgit v1.2.3 From d107b3265aa5e61a1e326b2815a767526ddb12ac Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 26 Jan 2026 16:13:48 +0100 Subject: objtool: Replace custom macros in elf.c with shared ones The source file tools/objtool/elf.c defines the macros ALIGN_UP(), ALIGN_UP_POW2() and MAX(). These macros unnecessarily duplicate functionality already available under tools/include/, specifically ALIGN(), roundup_pow_of_two() and max(). More importantly, the definition of ALIGN_UP_POW2() is incorrect when the input is 1, as it results in a call to __builtin_clz(0), which produces an undefined result. This issue impacts the function elf_alloc_reloc(). When adding the first relocation to a section, the function allocates an undefined number of relocations. Replace the custom macros with the shared functionality to resolve these issues. Fixes: 2c05ca026218 ("objtool: Add elf_create_reloc() and elf_init_reloc()") Signed-off-by: Petr Pavlu Link: https://patch.msgid.link/20260126151356.3924887-1-petr.pavlu@suse.com Signed-off-by: Josh Poimboeuf --- tools/objtool/elf.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6a8ed9c62323..2c02c7b49265 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -18,15 +18,14 @@ #include #include #include +#include +#include #include +#include #include #include #include -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U))) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - static inline u32 str_hash(const char *str) { return jhash(str, strlen(str), 0); @@ -1336,7 +1335,7 @@ unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char return -1; } - offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign); + offset = ALIGN(strtab->sh.sh_size, strtab->sh.sh_addralign); if (!elf_add_data(elf, strtab, str, strlen(str) + 1)) return -1; @@ -1378,7 +1377,7 @@ void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_ sec->data->d_size = size; sec->data->d_align = 1; - offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign); + offset = ALIGN(sec->sh.sh_size, sec->sh.sh_addralign); sec->sh.sh_size = offset + size; mark_sec_changed(elf, sec, true); @@ -1502,7 +1501,7 @@ static int elf_alloc_reloc(struct elf *elf, struct section *rsec) rsec->data->d_size = nr_relocs_new * elf_rela_size(elf); rsec->sh.sh_size = rsec->data->d_size; - nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new)); + nr_alloc = max(64UL, roundup_pow_of_two(nr_relocs_new)); if (nr_alloc <= rsec->nr_alloc_relocs) return 0; -- cgit v1.2.3 From f2dba60339a6299e181671e95293efe312237e2d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 25 Jan 2026 21:56:39 -0800 Subject: objtool/klp: Fix bug table handling for __WARN_printf() Running objtool klp-diff on a changed function which uses WARN() can fail with: vmlinux.o: error: objtool: md_run+0x866: failed to convert reloc sym '__bug_table' to its proper format The problem is that since commit 5b472b6e5bd9 ("x86_64/bug: Implement __WARN_printf()"), each __WARN_printf() call site now directly references its bug table entry. klp-diff errors out when it can't convert such section-based references to object symbols (because bug table entries don't have symbols). Luckily, klp-diff already has code to create symbols for bug table entries. Move that code earlier, before function diffing. Fixes: dd590d4d57eb ("objtool/klp: Introduce klp diff subcommand for diffing object files") Fixes: 5b472b6e5bd9 ("x86_64/bug: Implement __WARN_printf()") Reported-by: Song Liu Tested-by: Song Liu Link: https://patch.msgid.link/a8e0a714b9da962858842b9aecd63b4900927c88.1769406850.git.jpoimboe@kernel.org Signed-off-by: Josh Poimboeuf --- tools/objtool/klp-diff.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c index 4d1f9e9977eb..d94531e3f64e 100644 --- a/tools/objtool/klp-diff.c +++ b/tools/objtool/klp-diff.c @@ -1425,9 +1425,6 @@ static int clone_special_sections(struct elfs *e) { struct section *patched_sec; - if (create_fake_symbols(e->patched)) - return -1; - for_each_sec(e->patched, patched_sec) { if (is_special_section(patched_sec)) { if (clone_special_section(e, patched_sec)) @@ -1704,6 +1701,17 @@ int cmd_klp_diff(int argc, const char **argv) if (!e.out) return -1; + /* + * Special section fake symbols are needed so that individual special + * section entries can be extracted by clone_special_sections(). + * + * Note the fake symbols are also needed by clone_included_functions() + * because __WARN_printf() call sites add references to bug table + * entries in the calling functions. + */ + if (create_fake_symbols(e.patched)) + return -1; + if (clone_included_functions(&e)) return -1; -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From 1456ebb291ddee67c9144c8f7f38a6dddcd32ed7 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:11 +0000 Subject: selftests/bpf: cover BPF_CGROUP_ITER_CHILDREN control option Extend some of the existing CSS iterator selftests such that they cover the newly introduced BPF_CGROUP_ITER_CHILDREN iterator control option. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/cgroup_iter.c | 12 ++++++++++++ tools/testing/selftests/bpf/prog_tests/iters.c | 8 +++++++- tools/testing/selftests/bpf/progs/iters_css.c | 9 ++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 574d9a0cdc8e..0f88a9d00a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -190,6 +190,16 @@ static void test_walk_self_only(struct cgroup_iter *skel) BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } +static void test_walk_children(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, cg_id[CHILD1], + cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_CHILDREN, "children"); +} + static void test_walk_dead_self_only(struct cgroup_iter *skel) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -325,6 +335,8 @@ void test_cgroup_iter(void) test_walk_dead_self_only(skel); if (test__start_subtest("cgroup_iter__self_only_css_task")) test_walk_self_only_css_task(); + if (test__start_subtest("cgroup_iter__children")) + test_walk_children(skel); out: cgroup_iter__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3cea71f9c500..a539980a2fbe 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -253,6 +253,11 @@ static void subtest_css_iters(void) { "/cg1/cg2" }, { "/cg1/cg2/cg3" }, { "/cg1/cg2/cg3/cg4" }, + { "/cg1/cg5" }, + { "/cg1/cg5/cg6" }, + { "/cg1/cg7" }, + { "/cg1/cg7/cg8" }, + { "/cg1/cg7/cg8/cg9" }, }; int err, cg_nr = ARRAY_SIZE(cgs); int i; @@ -284,7 +289,8 @@ static void subtest_css_iters(void) ASSERT_EQ(skel->bss->post_order_cnt, cg_nr, "post_order_cnt"); ASSERT_EQ(skel->bss->last_cg_id, get_cgroup_id(cgs[0].path), "last_cg_id"); - ASSERT_EQ(skel->bss->tree_high, cg_nr - 1, "tree_high"); + ASSERT_EQ(skel->bss->children_cnt, 3, "children_cnt"); + ASSERT_EQ(skel->bss->tree_high, 3, "tree_high"); iters_css__detach(skel); cleanup: cleanup_cgroup_environment(); diff --git a/tools/testing/selftests/bpf/progs/iters_css.c b/tools/testing/selftests/bpf/progs/iters_css.c index ec1f6c2f590b..5a1d87d186a9 100644 --- a/tools/testing/selftests/bpf/progs/iters_css.c +++ b/tools/testing/selftests/bpf/progs/iters_css.c @@ -12,8 +12,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; u64 root_cg_id, leaf_cg_id; u64 first_cg_id, last_cg_id; - -int pre_order_cnt, post_order_cnt, tree_high; +int pre_order_cnt, post_order_cnt, children_cnt, tree_high; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; @@ -43,7 +42,7 @@ int iter_css_for_each(const void *ctx) } root_css = &root_cgrp->self; leaf_css = &leaf_cgrp->self; - pre_order_cnt = post_order_cnt = tree_high = 0; + pre_order_cnt = post_order_cnt = children_cnt = tree_high = 0; first_cg_id = last_cg_id = 0; bpf_rcu_read_lock(); @@ -60,6 +59,10 @@ int iter_css_for_each(const void *ctx) first_cg_id = cur_cgrp->kn->id; } + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_CHILDREN) { + children_cnt++; + } + bpf_for_each(css, pos, leaf_css, BPF_CGROUP_ITER_ANCESTORS_UP) tree_high++; -- cgit v1.2.3 From 17e2ce02bf5669dfa659976e93d409228cba98f9 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:45 +0800 Subject: selftests/bpf: Add tests for FIONREAD and copied_seq This commit adds two new test functions: one to reproduce the bug reported by syzkaller [1], and another to cover the calculation of copied_seq. The tests primarily involve installing and uninstalling sockmap on sockets, then reading data to verify proper functionality. Additionally, extend the do_test_sockmap_skb_verdict_fionread() function to support UDP FIONREAD testing. [1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-4-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 294 ++++++++++++++++++++- .../selftests/bpf/progs/test_sockmap_pass_prog.c | 14 + 2 files changed, 302 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1e3e4392dcca..256707e7d20d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include -#include +#include +#include #include #include "test_progs.h" @@ -22,6 +23,15 @@ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +/** + * SOL_TCP is defined in (glibc), but the copybuf_address + * field of tcp_zerocopy_receive is not yet included in older versions. + * This workaround remains necessary until the glibc update propagates. + */ +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + static int connected_socket_v4(void) { struct sockaddr_in addr = { @@ -536,13 +546,14 @@ out: } -static void test_sockmap_skb_verdict_fionread(bool pass_prog) +static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog) { int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; + int split_len = sizeof(buf) / 2; if (pass_prog) { pass = test_sockmap_pass_prog__open_and_load(); @@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) return; verdict = bpf_program__fd(pass->progs.prog_skb_verdict); map = bpf_map__fd(pass->maps.sock_map_rx); - expected = sizeof(buf); + if (sotype == SOCK_DGRAM) + expected = split_len; /* FIONREAD for UDP is different from TCP */ + else + expected = sizeof(buf); } else { drop = test_sockmap_drop_prog__open_and_load(); if (!ASSERT_OK_PTR(drop, "open_and_load")) @@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; @@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) goto out_close; - sent = xsend(p1, &buf, sizeof(buf), 0); - ASSERT_EQ(sent, sizeof(buf), "xsend(p0)"); + sent = xsend(p1, &buf, split_len, 0); + sent += xsend(p1, &buf, sizeof(buf) - split_len, 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); err = ioctl(c1, FIONREAD, &avail); ASSERT_OK(err, "ioctl(FIONREAD) error"); ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); @@ -597,6 +612,12 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_fionread(bool pass_prog) +{ + do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog); + do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog); +} + static void test_sockmap_skb_verdict_change_tail(void) { struct test_sockmap_change_tail *skel; @@ -1042,6 +1063,257 @@ close_map: xclose(map); } +/* it is used to reproduce WARNING */ +static void test_sockmap_zc(void) +{ + int map, err, sent, recvd, zero = 0, one = 1, on = 1; + char buf[10] = "0123456789", rcv[11], addr[100]; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct tcp_zerocopy_receive zc; + socklen_t zc_len = sizeof(zc); + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend")) + goto end; + + /* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */ + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)")) + goto end; + + /* uninstall sockmap of p1 */ + bpf_map_delete_elem(map, &one); + + /* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */ + sent = xsend(c1, buf, sizeof(buf) - 1, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend")) + goto end; + + err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on)); + if (!ASSERT_OK(err, "setsockopt")) + goto end; + + memset(&zc, 0, sizeof(zc)); + zc.copybuf_address = (__u64)((unsigned long)addr); + zc.copybuf_len = sizeof(addr); + + err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); + if (!ASSERT_OK(err, "getsockopt")) + goto end; + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* it is used to check whether copied_seq of sk is correct */ +static void test_sockmap_copied_seq(bool strp) +{ + int i, map, err, sent, recvd, zero = 0, one = 1; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + if (strp) { + prog = skel->progs.prog_skb_verdict_ingress_strp; + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach parser")) + goto end; + } + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto end; + + /* just trigger sockamp: data sent by c0 will be received by p1 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf")) + goto end; + + /* do partial read */ + recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1); + recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; + + /* uninstall sockmap of p1 and p0 */ + err = bpf_map_delete_elem(map, &one); + if (!ASSERT_OK(err, "bpf_map_delete_elem(1)")) + goto end; + + err = bpf_map_delete_elem(map, &zero); + if (!ASSERT_OK(err, "bpf_map_delete_elem(0)")) + goto end; + + /* now all sockets become plain socket, they should still work */ + for (i = 0; i < 5; i++) { + /* test copied_seq of p1 by running tcp native stack */ + sent = xsend(c1, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native")) + goto end; + + recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native")) + goto end; + + /* p0 previously redirected skb to p1, we also check copied_seq of p0 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native")) + goto end; + + recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native")) + goto end; + } + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* Wait until FIONREAD returns the expected value or timeout */ +static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int avail = 0; + + while (elapsed < timeout_ms) { + if (ioctl(fd, FIONREAD, &avail) < 0) + return -errno; + if (avail >= expected) + return avail; + usleep(1000); + elapsed++; + } + return avail; +} + +/* it is used to send data to via native stack and BPF redirecting */ +static void test_sockmap_multi_channels(int sotype) +{ + int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); + if (err) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + /* send data to p1 via native stack */ + sent = xsend(c1, buf, 2, 0); + if (!ASSERT_EQ(sent, 2, "xsend(2)")) + goto end; + + avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return"); + + /* send data to p1 via bpf redirecting */ + sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)")) + goto end; + + /* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable + * here since it may return immediately if prior data is already queued. + */ + expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf); + avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return"); + + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1108,4 +1380,14 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_vsock_poll(); if (test__start_subtest("sockmap vsock unconnected")) test_sockmap_vsock_unconnected(); + if (test__start_subtest("sockmap with zc")) + test_sockmap_zc(); + if (test__start_subtest("sockmap recover")) + test_sockmap_copied_seq(false); + if (test__start_subtest("sockmap recover with strp")) + test_sockmap_copied_seq(true); + if (test__start_subtest("sockmap tcp multi channels")) + test_sockmap_multi_channels(SOCK_STREAM); + if (test__start_subtest("sockmap udp multi channels")) + test_sockmap_multi_channels(SOCK_DGRAM); } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 69aacc96db36..ef9edca184ea 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb) return SK_PASS; } +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_ingress(struct __sk_buff *skb) +{ + int one = 1; + + return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS); +} + +SEC("sk_skb/stream_parser") +int prog_skb_verdict_ingress_strp(struct __sk_buff *skb) +{ + return skb->len; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 612e4022c616eba66ed15e6b7a9924251e0298e8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Jan 2026 12:43:43 -0300 Subject: perf strlist: Remove dont_dupstr logic, used only once Ian Rogers noticed that 678ed6b707e4b2db ("perf strlist: Don't write to const memory") breaks the 'Remove thread map' 'perf test' entry, because it keeps pointers to the temporary string introduced to avoid touching the const memory. This is because the thread_map__new_by_[pt]id_str() were the only methods using the slist->dont_dupstr knob to keep pointers to the original const string list, as it uses strtol to parse numbers and it stops at the comma. As this is the only case of dont_dupstr use, dupstr being the default, and it gets in the way of getting rid of the last const-correctness, remove this knob, with it: $ perf test 37 37: Remove thread map : Ok $ Fixes: 678ed6b707e4b2db ("perf strlist: Don't write to const memory") Reported-by: Ian Rogers Tested-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/strlist.c | 25 ++++++++----------------- tools/perf/util/strlist.h | 2 -- tools/perf/util/thread_map.c | 18 ++++++------------ 3 files changed, 14 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 98883672fcf4..50add72575e0 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -12,20 +12,16 @@ #include static -struct rb_node *strlist__node_new(struct rblist *rblist, const void *entry) +struct rb_node *strlist__node_new(struct rblist *rblist __maybe_unused, const void *entry) { const char *s = entry; struct rb_node *rc = NULL; - struct strlist *strlist = container_of(rblist, struct strlist, rblist); struct str_node *snode = malloc(sizeof(*snode)); if (snode != NULL) { - if (strlist->dupstr) { - s = strdup(s); - if (s == NULL) - goto out_delete; - } - snode->s = s; + snode->s = strdup(s); + if (snode->s == NULL) + goto out_delete; rc = &snode->rb_node; } @@ -36,20 +32,18 @@ out_delete: return NULL; } -static void str_node__delete(struct str_node *snode, bool dupstr) +static void str_node__delete(struct str_node *snode) { - if (dupstr) - zfree((char **)&snode->s); + zfree((char **)&snode->s); free(snode); } static -void strlist__node_delete(struct rblist *rblist, struct rb_node *rb_node) +void strlist__node_delete(struct rblist *rblist __maybe_unused, struct rb_node *rb_node) { - struct strlist *slist = container_of(rblist, struct strlist, rblist); struct str_node *snode = container_of(rb_node, struct str_node, rb_node); - str_node__delete(snode, slist->dupstr); + str_node__delete(snode); } static int strlist__node_cmp(struct rb_node *rb_node, const void *entry) @@ -165,12 +159,10 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf struct strlist *slist = malloc(sizeof(*slist)); if (slist != NULL) { - bool dupstr = true; bool file_only = false; const char *dirname = NULL; if (config) { - dupstr = !config->dont_dupstr; dirname = config->dirname; file_only = config->file_only; } @@ -180,7 +172,6 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf slist->rblist.node_new = strlist__node_new; slist->rblist.node_delete = strlist__node_delete; - slist->dupstr = dupstr; slist->file_only = file_only; if (list && strlist__parse_list(slist, list, dirname) != 0) diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h index 7e82c71dcc42..3e9533e66ca9 100644 --- a/tools/perf/util/strlist.h +++ b/tools/perf/util/strlist.h @@ -14,7 +14,6 @@ struct str_node { struct strlist { struct rblist rblist; - bool dupstr; bool file_only; }; @@ -24,7 +23,6 @@ struct strlist { * found */ struct strlist_config { - bool dont_dupstr; bool file_only; const char *dirname; }; diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index ca193c1374ed..48c70f149e92 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -164,19 +164,16 @@ static struct perf_thread_map *thread_map__new_by_pid_str(const char *pid_str) struct dirent **namelist = NULL; int i, j = 0; pid_t pid, prev_pid = INT_MAX; - char *end_ptr; struct str_node *pos; - struct strlist_config slist_config = { .dont_dupstr = true, }; - struct strlist *slist = strlist__new(pid_str, &slist_config); + struct strlist *slist = strlist__new(pid_str, NULL); if (!slist) return NULL; strlist__for_each_entry(pos, slist) { - pid = strtol(pos->s, &end_ptr, 10); + pid = strtol(pos->s, NULL, 10); - if (pid == INT_MIN || pid == INT_MAX || - (*end_ptr != '\0' && *end_ptr != ',')) + if (pid == INT_MIN || pid == INT_MAX) goto out_free_threads; if (pid == prev_pid) @@ -223,24 +220,21 @@ struct perf_thread_map *thread_map__new_by_tid_str(const char *tid_str) struct perf_thread_map *threads = NULL, *nt; int ntasks = 0; pid_t tid, prev_tid = INT_MAX; - char *end_ptr; struct str_node *pos; - struct strlist_config slist_config = { .dont_dupstr = true, }; struct strlist *slist; /* perf-stat expects threads to be generated even if tid not given */ if (!tid_str) return perf_thread_map__new_dummy(); - slist = strlist__new(tid_str, &slist_config); + slist = strlist__new(tid_str, NULL); if (!slist) return NULL; strlist__for_each_entry(pos, slist) { - tid = strtol(pos->s, &end_ptr, 10); + tid = strtol(pos->s, NULL, 10); - if (tid == INT_MIN || tid == INT_MAX || - (*end_ptr != '\0' && *end_ptr != ',')) + if (tid == INT_MIN || tid == INT_MAX) goto out_free_threads; if (tid == prev_tid) -- cgit v1.2.3 From 297c9d96e3085116c5cde18170dba716a1f2591e Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 21 Jan 2026 16:19:40 +0000 Subject: perf jevents: Handle deleted JSONS in out of source builds Make the source folders a dependency for the generated folder root so that whenever a file is deleted from the source it will force a new fresh copy of all the JSON files and avoid stale deleted files. JSON_DIRS_OUTPUT_ROOT needs to be a dependency of LEGACY_CACHE_JSON so that the root folder doesn't get cleaned after the legacy JSON is generated. But this is a no-op with in-source builds as JSON_DIRS_OUTPUT_ROOT is unset. JSON_DIRS is added as a dependency of PMU_EVENTS_C which also forces a re-build for in source builds when JSON files are deleted. This could have also resulted in stale builds, but never a broken one. Closes: https://lore.kernel.org/linux-next/aW5XSAo88_LBPSYI@sirena.org.uk/ Fixes: 4bb55de4ff03db3e ("perf jevents: Support copying the source json files to OUTPUT") Reported-by: Mark Brown Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/Build | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index a46ab7b612df..4f9ef624ba70 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -1,5 +1,6 @@ pmu-events-y += pmu-events.o JSON = $(shell find pmu-events/arch -name '*.json' -o -name '*.csv') +JSON_DIRS = $(shell find pmu-events/arch -type d) JDIR_TEST = pmu-events/arch/test JSON_TEST = $(shell [ -d $(JDIR_TEST) ] && \ find $(JDIR_TEST) -name '*.json') @@ -31,16 +32,23 @@ $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C) else # Copy checked-in json to OUTPUT for generation if it's an out of source build ifneq ($(OUTPUT),) -$(OUTPUT)pmu-events/arch/%: pmu-events/arch/% +# Remove all output directories when any source directory timestamp changes +# so there are no stale deleted files +JSON_DIRS_ROOT = $(OUTPUT)pmu-events/arch/ +$(JSON_DIRS_ROOT): $(JSON_DIRS) + $(Q)$(call echo-cmd,gen)rm -rf $@ + $(Q)mkdir -p $@ + +$(OUTPUT)pmu-events/arch/%: pmu-events/arch/% $(JSON_DIRS_ROOT) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)cp $< $@ endif -$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) +$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@ -GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON) +GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON) $(JSON_DIRS) $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY) $(call rule_mkdir) -- cgit v1.2.3 From cc4448d0856d424e52b5f53b2592575598233eac Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:03 -0500 Subject: tools/sched_ext: add scx_userland scheduler Add in the scx_userland scheduler that does vruntime-based scheduling in userspace code and communicates scheduling decisions to BPF by accessing and modifying globals through the skeleton. Cc: Tejun Heo Cc: David Vernet Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_userland.bpf.c | 344 +++++++++++++++++++++++++++++ tools/sched_ext/scx_userland.c | 437 +++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_userland.h | 17 ++ 4 files changed, 799 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_userland.bpf.c create mode 100644 tools/sched_ext/scx_userland.c create mode 100644 tools/sched_ext/scx_userland.h (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index e4bda2474060..12043a82a1a9 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c new file mode 100644 index 000000000000..f29862b89386 --- /dev/null +++ b/tools/sched_ext/scx_userland.bpf.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_userland.h" + +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + +char _license[] SEC("license") = "GPL"; + +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; + +/* + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. + */ +volatile u64 nr_scheduled; + +UEI_DEFINE(uei); + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + p = usersched_task(); + if (p) { + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task = {}; + + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + set_usersched_needed(); + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_and_clear_usersched_needed()) + dispatch_user_scheduler(); + + bpf_repeat(MAX_ENQUEUED_TASKS) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +/* + * A CPU is about to change its idle state. If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from and idle state, a CPU owner will + * be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(userland_ops, + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | + SCX_OPS_KEEP_BUILTIN_IDLE, + .name = "userland"); diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c new file mode 100644 index 000000000000..10b31020f44f --- /dev/null +++ b/tools/sched_ext/scx_userland.c @@ -0,0 +1,437 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "scx_userland.h" +#include "scx_userland.bpf.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" +"Usage: %s [-b BATCH]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static bool verbose; +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task *tasks; +static int pid_max; + +static double min_vruntime; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(__s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + nr_vruntime_failed++; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= pid_max) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + nr_curr_enqueued++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. + */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; + return; + } + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + break; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. + */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; + } + nr_curr_enqueued--; + } + skel->bss->nr_scheduled = nr_curr_enqueued; +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); + printf("o-----------------------o\n"); + printf("\n\n"); + fflush(stdout); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static void pre_bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + + err = init_tasks(); + if (err) + exit(err); + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. + */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); + + while ((opt = getopt(argc, argv, "b:vh")) != -1) { + switch (opt) { + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + SCX_BUG_ON(err, "Failed to prefault and lock address space"); +} + +static void bootstrap(char *comm) +{ + skel = SCX_OPS_OPEN(userland_ops, scx_userland); + + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + + SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei); + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); + + ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland); +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + __u64 ecode; + + pre_bootstrap(argc, argv); +restart: + bootstrap(argv[0]); + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + ecode = UEI_REPORT(skel, uei); + scx_userland__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h new file mode 100644 index 000000000000..684fb2dd5de9 --- /dev/null +++ b/tools/sched_ext/scx_userland.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. + */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H -- cgit v1.2.3 From f0262b102c7ce43f3744bdb0278ddf0d15bb1a71 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:04 -0500 Subject: tools/sched_ext: add scx_pair scheduler Add the scx_pair cgroup-based core scheduler. Cc: Tejun Heo Cc: David Vernet Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_pair.bpf.c | 610 +++++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_pair.c | 180 ++++++++++++ tools/sched_ext/scx_pair.h | 9 + 4 files changed, 800 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_pair.bpf.c create mode 100644 tools/sched_ext/scx_pair.c create mode 100644 tools/sched_ext/scx_pair.h (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 12043a82a1a9..208e8f8fe4d8 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c new file mode 100644 index 000000000000..267011b57cba --- /dev/null +++ b/tools/sched_ext/scx_pair.bpf.c @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * This scheduler is a minimal implementation and would need some form of + * priority handling both inside each cgroup and across the cgroups to be + * practically useful. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. + * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move onto + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because this it hits several BPF shortcomings and has to + * work around in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and could be preempted by them at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_pair.h" + +char _license[] SEC("license") = "GPL"; + +/* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 1; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns; + +/* cpu ID -> pair cpu ID */ +const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); + +/* cpu ID -> pair_id */ +const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. + */ + u32 preempted_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. + */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. + */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + struct cgrp_q *cgq; + s32 pid = p->pid; + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + cgrp = scx_bpf_task_cgroup(p); + cgid = cgrp->kn->id; + bpf_cgroup_release(cgrp); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr; + + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *mask = 1U << *vptr; + + return 0; +} + +__attribute__((noinline)) +static int try_dispatch(s32 cpu) +{ + struct pair_ctx *pairc; + struct bpf_map *cgq_map; + struct task_struct *p; + u64 now = scx_bpf_now(); + bool kick_pair = false; + bool expired, pair_preempted; + u32 *vptr, in_pair_mask; + s32 pid, q_idx; + u64 cgid; + int ret; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) { + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", + cpu); + return -ENOENT; + } + + bpf_spin_lock(&pairc->lock); + pairc->active_mask &= ~in_pair_mask; + + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); + if (expired || pairc->draining) { + u64 new_cgid = 0; + + __sync_fetch_and_add(&nr_exps, 1); + + /* + * We're done with the current cgid. An obvious optimization + * would be not draining if the next cgroup is the current one. + * For now, be dumb and always expire. + */ + pairc->draining = true; + + pair_preempted = pairc->preempted_mask; + if (pairc->active_mask || pair_preempted) { + /* + * The other CPU is still active, or is no longer under + * our control due to e.g. being preempted by a higher + * priority sched_class. We want to wait until this + * cgroup expires, or until control of our pair CPU has + * been returned to us. + * + * If the pair controls its CPU, and the time already + * expired, kick. When the other CPU arrives at + * dispatch and clears its active mask, it'll push the + * pair to the next cgroup and kick this CPU. + */ + __sync_fetch_and_add(&nr_exp_waits, 1); + bpf_spin_unlock(&pairc->lock); + if (expired && !pair_preempted) + kick_pair = true; + goto out_maybe_kick; + } + + bpf_spin_unlock(&pairc->lock); + + /* + * Pick the next cgroup. It'd be easier / cleaner to not drop + * pairc->lock and use stronger synchronization here especially + * given that we'll be switching cgroups significantly less + * frequently than tasks. Unfortunately, bpf_spin_lock can't + * really protect anything non-trivial. Let's do opportunistic + * operations instead. + */ + bpf_repeat(BPF_MAX_LOOPS) { + u32 *q_idx; + u64 *cgq_len; + + if (bpf_map_pop_elem(&top_q, &new_cgid)) { + /* no active cgroup, go idle */ + __sync_fetch_and_add(&nr_exp_empty, 1); + return 0; + } + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); + if (!q_idx) + continue; + + /* + * This is the only place where empty cgroups are taken + * off the top_q. + */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len || !*cgq_len) + continue; + + /* + * If it has any tasks, requeue as we may race and not + * execute it. + */ + bpf_map_push_elem(&top_q, &new_cgid, 0); + break; + } + + bpf_spin_lock(&pairc->lock); + + /* + * The other CPU may already have started on a new cgroup while + * we dropped the lock. Make sure that we're still draining and + * start on the new cgroup. + */ + if (pairc->draining && !pairc->active_mask) { + __sync_fetch_and_add(&nr_cgrp_next, 1); + pairc->cgid = new_cgid; + pairc->started_at = now; + pairc->draining = false; + kick_pair = true; + } else { + __sync_fetch_and_add(&nr_cgrp_coll, 1); + } + } + + cgid = pairc->cgid; + pairc->active_mask |= in_pair_mask; + bpf_spin_unlock(&pairc->lock); + + /* again, it'd be better to do all these with the lock held, oh well */ + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!vptr) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return -ENOENT; + } + q_idx = *vptr; + + /* claim one task from cgrp_q w/ q_idx */ + bpf_repeat(BPF_MAX_LOOPS) { + u64 *cgq_len, len; + + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { + /* the cgroup must be empty, expire and repeat */ + __sync_fetch_and_add(&nr_cgrp_empty, 1); + bpf_spin_lock(&pairc->lock); + pairc->draining = true; + pairc->active_mask &= ~in_pair_mask; + bpf_spin_unlock(&pairc->lock); + return -EAGAIN; + } + + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) + continue; + + break; + } + + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); + if (!cgq_map) { + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + if (bpf_map_pop_elem(cgq_map, &pid)) { + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + p = bpf_task_from_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } else { + /* we don't handle dequeues, retry on lost tasks */ + __sync_fetch_and_add(&nr_missing, 1); + return -EAGAIN; + } + +out_maybe_kick: + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } + return 0; +} + +void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) +{ + bpf_repeat(BPF_MAX_LOOPS) { + if (try_dispatch(cpu) != -EAGAIN) + break; + } +} + +void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask &= ~in_pair_mask; + /* Kick the pair CPU, unless it was also preempted. */ + kick_pair = !pairc->preempted_mask; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } +} + +void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask |= in_pair_mask; + pairc->active_mask &= ~in_pair_mask; + /* Kick the pair CPU if it's still running. */ + kick_pair = pairc->active_mask; + pairc->draining = true; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + } + } + __sync_fetch_and_add(&nr_preemptions, 1); +} + +s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 i, q_idx; + + bpf_for(i, 0, MAX_CGRPS) { + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) + break; + } + if (i == MAX_CGRPS) + return -EBUSY; + + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); + if (busy) + *busy = 0; + return -EBUSY; + } + + return 0; +} + +void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 *q_idx; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (q_idx) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); + if (busy) + *busy = 0; + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); + } +} + +void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(pair_ops, + .enqueue = (void *)pair_enqueue, + .dispatch = (void *)pair_dispatch, + .cpu_acquire = (void *)pair_cpu_acquire, + .cpu_release = (void *)pair_cpu_release, + .cgroup_init = (void *)pair_cgroup_init, + .cgroup_exit = (void *)pair_cgroup_exit, + .exit = (void *)pair_exit, + .name = "pair"); diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c new file mode 100644 index 000000000000..d3e97faa6334 --- /dev/null +++ b/tools/sched_ext/scx_pair.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_pair.h" +#include "scx_pair.bpf.skel.h" + +const char help_fmt[] = +"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" +"execute from the same CPU cgroup.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-S STRIDE]\n" +"\n" +" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_pair *skel; + struct bpf_link *link; + __u64 seq = 0, ecode; + __s32 stride, i, opt, outer_fd; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(pair_ops, scx_pair); + + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpu_ids > 0); + skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + + /* pair up the earlier half to the latter by default, override with -s */ + stride = skel->rodata->nr_cpu_ids / 2; + + while ((opt = getopt(argc, argv, "S:vh")) != -1) { + switch (opt) { + case 'S': + stride = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids); + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) + skel->rodata_pair_cpu->pair_cpu[i] = -1; + + printf("Pairs: "); + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { + int j = (i + stride) % skel->rodata->nr_cpu_ids; + + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) + continue; + + SCX_BUG_ON(i == j, + "Invalid stride %d - CPU%d wants to be its own pair", + stride, i); + + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); + + skel->rodata_pair_cpu->pair_cpu[i] = j; + skel->rodata_pair_cpu->pair_cpu[j] = i; + skel->rodata_pair_id->pair_id[i] = i; + skel->rodata_pair_id->pair_id[j] = i; + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; + + printf("[%d, %d] ", i, j); + } + printf("\n"); + + SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei); + + /* + * Populate the cgrp_q_arr map which is an array containing per-cgroup + * queues. It'd probably be better to do this from BPF but there are too + * many to initialize statically and there's no way to dynamically + * populate from BPF. + */ + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); + + printf("Initializing"); + for (i = 0; i < MAX_CGRPS; i++) { + __s32 inner_fd; + + if (exit_req) + break; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, + sizeof(__u32), MAX_QUEUED, NULL); + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", + inner_fd); + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), + "Failed to set inner map"); + close(inner_fd); + + if (!(i % 10)) + printf("."); + fflush(stdout); + } + printf("\n"); + + /* + * Fully initialized, attach and run. + */ + link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_dispatched, + skel->bss->nr_missing); + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", + skel->bss->nr_kicks, + skel->bss->nr_preemptions); + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", + skel->bss->nr_exps, + skel->bss->nr_exp_waits, + skel->bss->nr_exp_empty); + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", + skel->bss->nr_cgrp_next, + skel->bss->nr_cgrp_coll, + skel->bss->nr_cgrp_empty); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_pair__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h new file mode 100644 index 000000000000..d9666a447d3f --- /dev/null +++ b/tools/sched_ext/scx_pair.h @@ -0,0 +1,9 @@ +#ifndef __SCX_EXAMPLE_PAIR_H +#define __SCX_EXAMPLE_PAIR_H + +enum { + MAX_QUEUED = 4096, + MAX_CGRPS = 4096, +}; + +#endif /* __SCX_EXAMPLE_PAIR_H */ -- cgit v1.2.3 From 36929ebd17ae66ed3acde9056a9daf611d81a2e5 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:05 -0500 Subject: tools/sched_ext: add arena based scheduler Add a scheduler that uses BPF arenas to manage task context data. Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_sdt.bpf.c | 710 ++++++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_sdt.c | 101 ++++++ tools/sched_ext/scx_sdt.h | 113 +++++++ 4 files changed, 925 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_sdt.bpf.c create mode 100644 tools/sched_ext/scx_sdt.c create mode 100644 tools/sched_ext/scx_sdt.h (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 208e8f8fe4d8..47ad7444677e 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c new file mode 100644 index 000000000000..48ea18614e28 --- /dev/null +++ b/tools/sched_ext/scx_sdt.bpf.c @@ -0,0 +1,710 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arena-based task data scheduler. This is a variation of scx_simple + * that uses a combined allocator and indexing structure to organize + * task data. Task context allocation is done when a task enters the + * scheduler, while freeing is done when it exits. Task contexts are + * retrieved from task-local storage, pointing to the allocated memory. + * + * The main purpose of this scheduler is to demostrate arena memory + * management. + * + * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024-2025 Emil Tsalapatis + * Copyright (c) 2024-2025 Tejun Heo + * + */ +#include +#include + +#include "scx_sdt.h" + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __uint(max_entries, 1 << 16); /* number of pages */ + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ +#else + __uint(max_entries, 1 << 20); /* number of pages */ + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ +#endif +} arena __weak SEC(".maps"); + +#define SHARED_DSQ 0 + +#define DEFINE_SDT_STAT(metric) \ +static inline void \ +stat_inc_##metric(struct scx_stats __arena *stats) \ +{ \ + cast_kern(stats); \ + stats->metric += 1; \ +} \ +__u64 stat_##metric; \ + +DEFINE_SDT_STAT(enqueue); +DEFINE_SDT_STAT(init); +DEFINE_SDT_STAT(exit); +DEFINE_SDT_STAT(select_idle_cpu); +DEFINE_SDT_STAT(select_busy_cpu); + +/* + * Necessary for cond_break/can_loop's semantics. According to kernel commit + * 011832b, the loop counter variable must be seen as imprecise and bounded + * by the verifier. Initializing it from a constant (e.g., i = 0;), then, + * makes it precise and prevents may_goto from helping with converging the + * loop. For these loops we must initialize the loop counter from a variable + * whose value the verifier cannot reason about when checking the program, so + * that the loop counter's value is imprecise. + */ +static __u64 zero = 0; + +/* + * XXX Hack to get the verifier to find the arena for sdt_exit_task. + * As of 6.12-rc5, The verifier associates arenas with programs by + * checking LD.IMM instruction operands for an arena and populating + * the program state with the first instance it finds. This requires + * accessing our global arena variable, but scx methods do not necessarily + * do so while still using pointers from that arena. Insert a bpf_printk + * statement that triggers at most once to generate an LD.IMM instruction + * to access the arena and help the verifier. + */ +static volatile bool scx_arena_verify_once; + +__hidden void scx_arena_subprog_init(void) +{ + if (scx_arena_verify_once) + return; + + bpf_printk("%s: arena pointer %p", __func__, &arena); + scx_arena_verify_once = true; +} + + +private(LOCK) struct bpf_spin_lock alloc_lock; +private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock; + +/* allocation pools */ +struct sdt_pool desc_pool; +struct sdt_pool chunk_pool; + +/* Protected by alloc_lock. */ +struct scx_alloc_stats alloc_stats; + + +/* Allocate element from the pool. Must be called with a then pool lock held. */ +static +void __arena *scx_alloc_from_pool(struct sdt_pool *pool) +{ + __u64 elem_size, max_elems; + void __arena *slab; + void __arena *ptr; + + elem_size = pool->elem_size; + max_elems = pool->max_elems; + + /* If the chunk is spent, get a new one. */ + if (pool->idx >= max_elems) { + slab = bpf_arena_alloc_pages(&arena, NULL, + div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0); + if (!slab) + return NULL; + + pool->slab = slab; + pool->idx = 0; + } + + ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx); + pool->idx += 1; + + return ptr; +} + +/* Alloc desc and associated chunk. Called with the allocator spinlock held. */ +static sdt_desc_t *scx_alloc_chunk(void) +{ + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + sdt_desc_t *out; + + chunk = scx_alloc_from_pool(&chunk_pool); + if (!chunk) + return NULL; + + desc = scx_alloc_from_pool(&desc_pool); + if (!desc) { + /* + * Effectively frees the previous chunk allocation. + * Index cannot be 0, so decrementing is always + * valid. + */ + chunk_pool.idx -= 1; + return NULL; + } + + out = desc; + + desc->nr_free = SDT_TASK_ENTS_PER_CHUNK; + desc->chunk = chunk; + + alloc_stats.chunk_allocs += 1; + + return out; +} + +static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages) +{ + if (unlikely(data_size % 8)) + return -EINVAL; + + if (unlikely(nr_pages == 0)) + return -EINVAL; + + pool->elem_size = data_size; + pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size; + /* Populate the pool slab on the first allocation. */ + pool->idx = pool->max_elems; + + return 0; +} + +/* Initialize both the base pool allocators and the root chunk of the index. */ +__hidden int +scx_alloc_init(struct scx_allocator *alloc, __u64 data_size) +{ + size_t min_chunk_size; + int ret; + + _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE, + "chunk size must fit into a page"); + + ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1); + if (ret != 0) + return ret; + + ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1); + if (ret != 0) + return ret; + + /* Wrap data into a descriptor and word align. */ + data_size += sizeof(struct sdt_data); + data_size = round_up(data_size, 8); + + /* + * Ensure we allocate large enough chunks from the arena to avoid excessive + * internal fragmentation when turning chunks it into structs. + */ + min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE); + ret = pool_set_size(&alloc->pool, data_size, min_chunk_size); + if (ret != 0) + return ret; + + bpf_spin_lock(&alloc_lock); + alloc->root = scx_alloc_chunk(); + bpf_spin_unlock(&alloc_lock); + if (!alloc->root) + return -ENOMEM; + + return 0; +} + +static +int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) +{ + __u64 __arena *allocated = desc->allocated; + __u64 bit; + + if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK)) + return -EINVAL; + + bit = (__u64)1 << (pos % 64); + + if (state) + allocated[pos / 64] |= bit; + else + allocated[pos / 64] &= ~bit; + + return 0; +} + +static __noinline +int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) +{ + sdt_desc_t *desc; + __u64 u, level; + int ret; + + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { + level = SDT_TASK_LEVELS - 1 - u; + + /* Only propagate upwards if we are the parent's only free chunk. */ + desc = lv_desc[level]; + + ret = set_idx_state(desc, lv_pos[level], false); + if (unlikely(ret != 0)) + return ret; + + desc->nr_free += 1; + if (desc->nr_free > 1) + return 0; + } + + return 0; +} + +/* + * Free the allocated struct with the given index. Called with the + * allocator lock taken. + */ +__hidden +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) +{ + const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1; + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + struct sdt_data __arena *data; + __u64 level, shift, pos; + __u64 lv_pos[SDT_TASK_LEVELS]; + int ret; + int i; + + if (!alloc) + return 0; + + desc = alloc->root; + if (unlikely(!desc)) + return -EINVAL; + + /* To appease the verifier. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + lv_desc[level] = NULL; + lv_pos[level] = 0; + } + + /* Find the leaf node containing the index. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; + pos = (idx >> shift) & mask; + + lv_desc[level] = desc; + lv_pos[level] = pos; + + if (level == SDT_TASK_LEVELS - 1) + break; + + chunk = desc->chunk; + + desc_children = (sdt_desc_t * __arena *)chunk->descs; + desc = desc_children[pos]; + + if (unlikely(!desc)) + return -EINVAL; + } + + chunk = desc->chunk; + + pos = idx & mask; + data = chunk->data[pos]; + if (likely(data)) { + data[pos] = (struct sdt_data) { + .tid.genn = data->tid.genn + 1, + }; + + /* Zero out one word at a time. */ + for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { + data->payload[i] = 0; + } + } + + ret = mark_nodes_avail(lv_desc, lv_pos); + if (unlikely(ret != 0)) + return ret; + + alloc_stats.active_allocs -= 1; + alloc_stats.free_ops += 1; + + return 0; +} + +static inline +int ffs(__u64 word) +{ + unsigned int num = 0; + + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } + + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + + if ((word & 0x1) == 0) { + num += 1; + word >>= 1; + } + + return num; +} + + +/* find the first empty slot */ +__hidden +__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc) +{ + __u64 freeslots; + __u64 i; + + for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) { + freeslots = ~desc->allocated[i]; + if (freeslots == (__u64)0) + continue; + + return (i * 64) + ffs(freeslots); + } + + return SDT_TASK_ENTS_PER_CHUNK; +} + +/* + * Find and return an available idx on the allocator. + * Called with the task spinlock held. + */ +static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp) +{ + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *tmp; + __u64 lv_pos[SDT_TASK_LEVELS]; + __u64 u, pos, level; + __u64 idx = 0; + int ret; + + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + pos = chunk_find_empty(desc); + + /* If we error out, something has gone very wrong. */ + if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK)) + return NULL; + + if (pos == SDT_TASK_ENTS_PER_CHUNK) + return NULL; + + idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT; + idx |= pos; + + /* Log the levels to complete allocation. */ + lv_desc[level] = desc; + lv_pos[level] = pos; + + /* The rest of the loop is for internal node traversal. */ + if (level == SDT_TASK_LEVELS - 1) + break; + + /* Allocate an internal node if necessary. */ + chunk = desc->chunk; + desc_children = (sdt_desc_t * __arena *)chunk->descs; + + desc = desc_children[pos]; + if (!desc) { + desc = scx_alloc_chunk(); + if (!desc) + return NULL; + + desc_children[pos] = desc; + } + } + + /* + * Finding the descriptor along with any internal node + * allocations was successful. Update all levels with + * the new allocation. + */ + bpf_for(u, 0, SDT_TASK_LEVELS) { + level = SDT_TASK_LEVELS - 1 - u; + tmp = lv_desc[level]; + + ret = set_idx_state(tmp, lv_pos[level], true); + if (ret != 0) + break; + + tmp->nr_free -= 1; + if (tmp->nr_free > 0) + break; + } + + *idxp = idx; + + return desc; +} + +__hidden +void __arena *scx_alloc(struct scx_allocator *alloc) +{ + struct sdt_data __arena *data = NULL; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + __u64 idx, pos; + + if (!alloc) + return NULL; + + bpf_spin_lock(&alloc_lock); + + /* We unlock if we encounter an error in the function. */ + desc = desc_find_empty(alloc->root, &idx); + if (unlikely(desc == NULL)) { + bpf_spin_unlock(&alloc_lock); + return NULL; + } + + chunk = desc->chunk; + + /* Populate the leaf node if necessary. */ + pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1); + data = chunk->data[pos]; + if (!data) { + data = scx_alloc_from_pool(&alloc->pool); + if (!data) { + scx_alloc_free_idx(alloc, idx); + bpf_spin_unlock(&alloc_lock); + return NULL; + } + } + + chunk->data[pos] = data; + + /* The data counts as a chunk */ + alloc_stats.data_allocs += 1; + alloc_stats.alloc_ops += 1; + alloc_stats.active_allocs += 1; + + data->tid.idx = idx; + + bpf_spin_unlock(&alloc_lock); + + return data; +} + +/* + * Task BPF map entry recording the task's assigned ID and pointing to the data + * area allocated in arena. + */ +struct scx_task_map_val { + union sdt_id tid; + __u64 tptr; + struct sdt_data __arena *data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct scx_task_map_val); +} scx_task_map SEC(".maps"); + +static struct scx_allocator scx_task_allocator; + +__hidden +void __arena *scx_task_alloc(struct task_struct *p) +{ + struct sdt_data __arena *data = NULL; + struct scx_task_map_val *mval; + + mval = bpf_task_storage_get(&scx_task_map, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!mval) + return NULL; + + data = scx_alloc(&scx_task_allocator); + if (unlikely(!data)) + return NULL; + + mval->tid = data->tid; + mval->tptr = (__u64) p; + mval->data = data; + + return (void __arena *)data->payload; +} + +__hidden +int scx_task_init(__u64 data_size) +{ + return scx_alloc_init(&scx_task_allocator, data_size); +} + +__hidden +void __arena *scx_task_data(struct task_struct *p) +{ + struct sdt_data __arena *data; + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return NULL; + + data = mval->data; + + return (void __arena *)data->payload; +} + +__hidden +void scx_task_free(struct task_struct *p) +{ + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return; + + bpf_spin_lock(&alloc_lock); + scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx); + bpf_spin_unlock(&alloc_lock); + + bpf_task_storage_delete(&scx_task_map, p); +} + +static inline void +scx_stat_global_update(struct scx_stats __arena *stats) +{ + cast_kern(stats); + __sync_fetch_and_add(&stat_enqueue, stats->enqueue); + __sync_fetch_and_add(&stat_init, stats->init); + __sync_fetch_and_add(&stat_exit, stats->exit); + __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu); + __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu); +} + +s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct scx_stats __arena *stats; + bool is_idle = false; + s32 cpu; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return 0; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc_select_idle_cpu(stats); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } else { + stat_inc_select_busy_cpu(stats); + } + + return cpu; +} + +void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_enqueue(stats); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_dsq_move_to_local(SHARED_DSQ); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_alloc(p); + if (!stats) { + scx_bpf_error("arena allocator out of memory"); + return -ENOMEM; + } + + stats->pid = p->pid; + + stat_inc_init(stats); + + return 0; +} + +void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_exit(stats); + scx_stat_global_update(stats); + + scx_task_free(p); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) +{ + int ret; + + ret = scx_task_init(sizeof(struct scx_stats)); + if (ret < 0) { + scx_bpf_error("%s: failed with %d", __func__, ret); + return ret; + } + + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(sdt_ops, + .select_cpu = (void *)sdt_select_cpu, + .enqueue = (void *)sdt_enqueue, + .dispatch = (void *)sdt_dispatch, + .init_task = (void *)sdt_init_task, + .exit_task = (void *)sdt_exit_task, + .init = (void *)sdt_init, + .exit = (void *)sdt_exit, + .name = "sdt"); diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c new file mode 100644 index 000000000000..b0363363476d --- /dev/null +++ b/tools/sched_ext/scx_sdt.c @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Emil Tsalapatis + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include + +#include "scx_sdt.h" +#include "scx_sdt.bpf.skel.h" + +const char help_fmt[] = +"A simple arena-based sched_ext scheduler.\n" +"\n" +"Modified version of scx_simple that demonstrates arena-based data structures.\n" +"\n" +"Usage: %s [-f] [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int sig) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_sdt *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei); + link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("====SCHEDULING STATS====\n"); + printf("enqueues=%llu\t", skel->bss->stat_enqueue); + printf("inits=%llu\t", skel->bss->stat_init); + printf("exits=%llu\t", skel->bss->stat_exit); + printf("\n"); + + printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu); + printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu); + printf("\n"); + + printf("====ALLOCATION STATS====\n"); + printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs); + printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs); + printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops); + printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops); + printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs); + printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used); + printf("\n\n"); + + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_sdt__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_sdt.h b/tools/sched_ext/scx_sdt.h new file mode 100644 index 000000000000..67982ce9bc9b --- /dev/null +++ b/tools/sched_ext/scx_sdt.h @@ -0,0 +1,113 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo + * Copyright (c) 2025 Emil Tsalapatis + */ +#pragma once + +#ifndef __BPF__ +#define __arena +#endif /* __BPF__ */ + +struct scx_alloc_stats { + __u64 chunk_allocs; + __u64 data_allocs; + __u64 alloc_ops; + __u64 free_ops; + __u64 active_allocs; + __u64 arena_pages_used; +}; + +struct sdt_pool { + void __arena *slab; + __u64 elem_size; + __u64 max_elems; + __u64 idx; +}; + +#ifndef div_round_up +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) +#endif + +#ifndef round_up +#define round_up(a, b) (div_round_up((a), (b)) * (b)) +#endif + +typedef struct sdt_desc __arena sdt_desc_t; + +enum sdt_consts { + SDT_TASK_ENTS_PER_PAGE_SHIFT = 9, + SDT_TASK_LEVELS = 3, + SDT_TASK_ENTS_PER_CHUNK = 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT, + SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64), + SDT_TASK_MIN_ELEM_PER_ALLOC = 8, +}; + +union sdt_id { + __s64 val; + struct { + __s32 idx; /* index in the radix tree */ + __s32 genn; /* ++'d on recycle so that it forms unique'ish 64bit ID */ + }; +}; + +struct sdt_chunk; + +/* + * Each index page is described by the following descriptor which carries the + * bitmap. This way the actual index can host power-of-two numbers of entries + * which makes indexing cheaper. + */ +struct sdt_desc { + __u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S]; + __u64 nr_free; + struct sdt_chunk __arena *chunk; +}; + +/* + * Leaf node containing per-task data. + */ +struct sdt_data { + union sdt_id tid; + __u64 payload[]; +}; + +/* + * Intermediate node pointing to another intermediate node or leaf node. + */ +struct sdt_chunk { + union { + sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK]; + struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK]; + }; +}; + +struct scx_allocator { + struct sdt_pool pool; + sdt_desc_t *root; +}; + +struct scx_stats { + int seq; + pid_t pid; + __u64 enqueue; + __u64 exit; + __u64 init; + __u64 select_busy_cpu; + __u64 select_idle_cpu; +}; + +#ifdef __BPF__ + +void __arena *scx_task_data(struct task_struct *p); +int scx_task_init(__u64 data_size); +void __arena *scx_task_alloc(struct task_struct *p); +void scx_task_free(struct task_struct *p); +void scx_arena_subprog_init(void); + +int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size); +u64 scx_alloc_internal(struct scx_allocator *alloc); +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx); + +#endif /* __BPF__ */ -- cgit v1.2.3 From 166e664e702ed96b83df2a87c1ea2138a995b604 Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Mon, 26 Jan 2026 14:15:31 +0800 Subject: selftests: ptp: use KSFT_SKIP exit code for skip scenarios The kselftest framework defines KSFT_SKIP=4 as the standard exit code for skipped tests. However, phc.sh currently uses a mix of 'exit 0' and 'exit 1' to indicate skip conditions, which can confuse test harnesses and CI systems. This patch introduces ksft_skip=4 variable and unifies all skip exit paths to use 'exit $ksft_skip', consistent with other selftests like net/lib.sh and net/fib_nexthops.sh. Signed-off-by: Junjie Cao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260126061532.12532-1-junjie.cao@intel.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/ptp/phc.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh index ac6e5a6e1d3a..51aad466d989 100755 --- a/tools/testing/selftests/ptp/phc.sh +++ b/tools/testing/selftests/ptp/phc.sh @@ -8,17 +8,20 @@ ALL_TESTS=" " DEV=$1 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ############################################################################## # Sanity checks if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" - exit 0 + exit $ksft_skip fi if [[ "$DEV" == "" ]]; then echo "SKIP: PTP device not provided" - exit 0 + exit $ksft_skip fi require_command() @@ -27,7 +30,7 @@ require_command() if [[ ! -x "$(command -v "$cmd")" ]]; then echo "SKIP: $cmd not installed" - exit 1 + exit $ksft_skip fi } @@ -37,7 +40,7 @@ phc_sanity() if [ $? != 0 ]; then echo "SKIP: unknown clock $DEV: No such device" - exit 1 + exit $ksft_skip fi } -- cgit v1.2.3 From 239f09e258b906deced5c2a7c1ac8aed301b558b Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Mon, 26 Jan 2026 14:15:32 +0800 Subject: selftests: ptp: treat unsupported PHC operations as skip Some PTP hardware clock (PHC) devices may return -EOPNOTSUPP for operations like settime, adjtime, or adjfreq. This commonly occurs with timestamp-only PHC implementations that don't support full clock control. For background, syzbot previously exposed a crash risk when PTP clock drivers lacked required callbacks[1]. Subsequent work[2] made callback presence a registration requirement. As a result, some drivers (like iwlwifi MVM/MLD[3]) now provide stub callbacks that return -EOPNOTSUPP for unsupported operations. When phc_ctl encounters such devices, the "Operation not supported" error should be treated as a skip (device limitation) rather than a test failure. This patch: - Adds [SKIP] output handling in log_test() - Detects "Operation not supported" from phc_ctl and returns ksft_skip - Returns ksft_skip if all tests are skipped, preventing false-positive results when testing timestamp-only PHC implementations Link: https://lore.kernel.org/netdev/20251028043216.1971292-1-junjie.cao@intel.com/ [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dfb073d32cac [2] Link: https://lore.kernel.org/netdev/20251204123204.9316-1-ziyao@disroot.org/ [3] Signed-off-by: Junjie Cao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260126061532.12532-2-junjie.cao@intel.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/ptp/phc.sh | 49 ++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh index 51aad466d989..9f61c1579edf 100755 --- a/tools/testing/selftests/ptp/phc.sh +++ b/tools/testing/selftests/ptp/phc.sh @@ -52,6 +52,7 @@ phc_sanity # Exit status to return at the end. Set in case one of the tests fails. EXIT_STATUS=0 +PASS_COUNT=0 # Per-test return value. Clear at the beginning of each test. RET=0 @@ -68,12 +69,18 @@ log_test() { local test_name=$1 + if [[ $RET -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "$test_name" + return 0 + fi + if [[ $RET -ne 0 ]]; then EXIT_STATUS=1 printf "TEST: %-60s [FAIL]\n" "$test_name" return 1 fi + ((PASS_COUNT++)) printf "TEST: %-60s [ OK ]\n" "$test_name" return 0 } @@ -92,34 +99,49 @@ tests_run() settime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 wait 120.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 wait 120.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 120 )) } adjtime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 adj 10 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 adj 10 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 10 )) } adjfreq_do() { - local res + local res out # Set the clock to be 1% faster - res=$(phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 101 )) } @@ -166,4 +188,7 @@ trap cleanup EXIT tests_run +if [[ $EXIT_STATUS -eq 0 && $PASS_COUNT -eq 0 ]]; then + exit $ksft_skip +fi exit $EXIT_STATUS -- cgit v1.2.3 From 944e3f7562c55fa37ebcdd58e5f60f296c81a854 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 27 Jan 2026 12:12:06 +0100 Subject: tools: Update context analysis macros in compiler_types.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In sync with the main kernel headers, include a stub version of compiler-context-analysis.h in tools/include/linux/compiler_types.h and remove the sparse context tracking definitions. Since tools/ headers are generally self-contained, provide a standalone tools/include/linux/compiler-context-analysis.h with no-op stubs for now. Also clean up redundant stubs in tools/testing/shared/linux/kernel.h that are now redundant. This fixes build errors in tools/testing/radix-tree/ where headers from include/linux/ (like cleanup.h) are used directly and expect these macros to be defined: | cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -c -o radix-tree.o radix-tree.c | In file included from ../shared/linux/cleanup.h:2, | from ../shared/linux/../../../../include/linux/idr.h:18, | from ../shared/linux/idr.h:5, | from radix-tree.c:18: | ../shared/linux/../../../../include/linux/idr.h: In function ‘class_idr_alloc_destructor’: | ../shared/linux/../../../../include/linux/cleanup.h:283:9: error: expected declaration specifiers before ‘__no_context_analysis’ | 283 | __no_context_analysis \ | | ^~~~~~~~~~~~~~~~~~~~~ Closes: https://lore.kernel.org/oe-lkp/202601261546.d7ae2447-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Tested-by: Lorenzo Stoakes Link: https://patch.msgid.link/20260127111428.3747328-1-elver@google.com --- tools/include/linux/compiler-context-analysis.h | 42 +++++++++++++++++++++++++ tools/include/linux/compiler_types.h | 16 +--------- tools/testing/shared/linux/kernel.h | 4 --- 3 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 tools/include/linux/compiler-context-analysis.h (limited to 'tools') diff --git a/tools/include/linux/compiler-context-analysis.h b/tools/include/linux/compiler-context-analysis.h new file mode 100644 index 000000000000..13a9115e9e58 --- /dev/null +++ b/tools/include/linux/compiler-context-analysis.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H +#define _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H + +/* + * Macros and attributes for compiler-based static context analysis. + * No-op stubs for tools. + */ + +#define __guarded_by(...) +#define __pt_guarded_by(...) + +#define context_lock_struct(name, ...) struct __VA_ARGS__ name + +#define __no_context_analysis +#define __context_unsafe(comment) +#define context_unsafe(...) ({ __VA_ARGS__; }) +#define context_unsafe_alias(p) +#define disable_context_analysis() +#define enable_context_analysis() + +#define __must_hold(...) +#define __must_not_hold(...) +#define __acquires(...) +#define __cond_acquires(ret, x) +#define __releases(...) +#define __acquire(x) (void)0 +#define __release(x) (void)0 + +#define __must_hold_shared(...) +#define __acquires_shared(...) +#define __cond_acquires_shared(ret, x) +#define __releases_shared(...) +#define __acquire_shared(x) (void)0 +#define __release_shared(x) (void)0 + +#define __acquire_ret(call, expr) (call) +#define __acquire_shared_ret(call, expr) (call) +#define __acquires_ret +#define __acquires_shared_ret + +#endif /* _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H */ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 067a5b4e0f7b..14e420467eee 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -13,21 +13,7 @@ #define __has_builtin(x) (0) #endif -#ifdef __CHECKER__ -/* context/locking */ -# define __must_hold(x) __attribute__((context(x,1,1))) -# define __acquires(x) __attribute__((context(x,0,1))) -# define __releases(x) __attribute__((context(x,1,0))) -# define __acquire(x) __context__(x,1) -# define __release(x) __context__(x,-1) -#else /* __CHECKER__ */ -/* context/locking */ -# define __must_hold(x) -# define __acquires(x) -# define __releases(x) -# define __acquire(x) (void)0 -# define __release(x) (void)0 -#endif /* __CHECKER__ */ +#include /* Compiler specific macros. */ #ifdef __GNUC__ diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h index c0a2bb785b92..dc2b4ccfb185 100644 --- a/tools/testing/shared/linux/kernel.h +++ b/tools/testing/shared/linux/kernel.h @@ -21,9 +21,5 @@ #define schedule() #define PAGE_SHIFT 12 -#define __acquires(x) -#define __releases(x) -#define __must_hold(x) - #define EXPORT_PER_CPU_SYMBOL_GPL(x) #endif /* _KERNEL_H */ -- cgit v1.2.3 From 96e004b4bdf9029d137a1b06de1606ff6e263ab8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 27 Jan 2026 16:16:14 +0000 Subject: kselftest/arm64: Add a no-SVE loop after SVE in fp-pidbench Some applications use SVE intermittently, one common case being where SVE is used during statup (eg, by ld.so) but then rarely if ever during the main application runtime. Add a repeat of the no SVE loop after we've done the SVE loops to fp-pidbench to capture results for that. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-pidbench.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S index 73830f6bc99b..aeeadc7873dc 100644 --- a/tools/testing/selftests/arm64/fp/fp-pidbench.S +++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S @@ -63,6 +63,10 @@ function _start puts "SVE used per syscall: " test_loop "rdvl x0, #8" + // Test non-SVE execution after SVE + puts "No SVE after SVE: " + test_loop + // And we're done out: mov x0, #0 -- cgit v1.2.3 From b661d753ce2ee951558db1f2c7b97f32d9431966 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 27 Jan 2026 16:16:15 +0000 Subject: kselftest/arm64: Raise default number of loops in fp-pidbench When fp-pidbench was originally written SVE hardware was not widely available so it was useful to run it in emulation and the default number of loops was set very low, running for less than a second on actual hardware. Now that SVE hardware is reasonably available it is very much less interesting to use emulation, bump the default number of loops up to even out a bit of the noise on real systems. On the machine I have to hand this now takes about 15s which is still a toy microbenchmark but perhaps a bit more useful. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-pidbench.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S index aeeadc7873dc..881dfa3b342e 100644 --- a/tools/testing/selftests/arm64/fp/fp-pidbench.S +++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S @@ -33,7 +33,7 @@ function _start puts "Iterations per test: " mov x20, #10000 - lsl x20, x20, #8 + lsl x20, x20, #12 mov x0, x20 bl putdec puts "\n" -- cgit v1.2.3 From 2a85bbaed06bec2b97dec15bb01cbdbf81dce3e3 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:49:55 +0000 Subject: perf header: Replace hardcoded max cpus by MAX_NR_CPUS cpumask and cpulist from cpu-domain header have hardcoded max_cpus value of 1024. Current systems have more cpus than this value. Replace it with MAX_NR_CPUS. Also define a macro to represent domain name length. Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info") Reported-by: Shrikanth Hegde Reviewed-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 5 +++-- tools/perf/util/util.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index eefd1cd73b6a..31c3bab1b10a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -76,6 +76,7 @@ static const u64 __perf_magic2 = 0x32454c4946524550ULL; static const u64 __perf_magic2_sw = 0x50455246494c4532ULL; #define PERF_MAGIC __perf_magic2 +#define DNAME_LEN 16 const char perf_version_string[] = PERF_VERSION; @@ -1616,10 +1617,10 @@ static int write_pmu_caps(struct feat_fd *ff, struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, u32 nr) { + char dname[DNAME_LEN], cpumask[MAX_NR_CPUS]; struct domain_info *domain_info; struct cpu_domain_map **cd_map; - char dname[16], cpumask[256]; - char cpulist[1024]; + char cpulist[MAX_NR_CPUS]; char *line = NULL; u32 cpu, domain; u32 dcount = 0; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 03a603fbcd7d..94f5a2ece245 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include "perf.h" #include "util.h" #include "debug.h" #include "event.h" @@ -262,7 +263,7 @@ void cpumask_to_cpulist(char *cpumask, char *cpulist) int i, j, bm_size, nbits; int len = strlen(cpumask); unsigned long *bm; - char cpus[1024]; + char cpus[MAX_NR_CPUS]; for (i = 0; i < len; i++) { if (cpumask[i] == ',') { -- cgit v1.2.3 From 05134d15375ce9fc57a91453999729d861efe9f9 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:49:56 +0000 Subject: perf util: Fix NULL check in cpumask_to_cpulist() The function cpumask_to_cpulist() allocates memory with calloc() and stores the result in 'bm', but then incorrectly checks 'cpumask' for NULL instead of 'bm'. This means that if the allocation fails, the function will dereference a NULL pointer when trying to access 'bm'. Fix the check to test the correct variable 'bm'. Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info") Reviewed-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 94f5a2ece245..8b893de35f77 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -279,7 +279,7 @@ void cpumask_to_cpulist(char *cpumask, char *cpulist) return; bm = calloc(bm_size, sizeof(unsigned long)); - if (!cpumask) + if (!bm) goto free_bm; for (i = 0; i < bm_size; i++) { -- cgit v1.2.3 From b03b95b4d71edbb10bfb1588dc59cfea8d13796c Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:49:57 +0000 Subject: perf sched stats: Add NULL check for cd_map In perf_sched__schedstat_live(), build_cpu_domain_map() returns the pointer to cpu_domain_map which can also be NULL. Add NULL check for the same to avoid NULL pointer dereference. Fixes: 00093b3133984ffe ("perf sched stats: Add support for live mode") Reviewed-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b190e928117c..f5e449bd6823 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -4714,6 +4714,11 @@ static int perf_sched__schedstat_live(struct perf_sched *sched, nr = cpu__max_present_cpu().cpu; cd_map = build_cpu_domain_map(&sv, &md, nr); + if (!cd_map) { + pr_err("Unable to generate cpu-domain relation info"); + goto out; + } + show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false); free_cpu_domain_info(cd_map, sv, nr); out: -- cgit v1.2.3 From 7284dc7e19fab1ff425579cbb4721fff292ac70e Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:49:58 +0000 Subject: perf sched stats: correct spelling of function name Replace store_schedtstat_cpu_diff() with store_schedstat_cpu_diff() Reviewed-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f5e449bd6823..1a24c4869331 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3946,7 +3946,7 @@ static struct schedstat_domain *domain_second_pass; static bool after_workload_flag; static bool verbose_field; -static void store_schedtstat_cpu_diff(struct schedstat_cpu *after_workload) +static void store_schedstat_cpu_diff(struct schedstat_cpu *after_workload) { struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data; struct perf_record_schedstat_cpu *after = after_workload->cpu_data; @@ -4437,7 +4437,7 @@ static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_un } domain_second_pass = list_first_entry(&cpu_second_pass->domain_head, struct schedstat_domain, domain_list); - store_schedtstat_cpu_diff(temp); + store_schedstat_cpu_diff(temp); } } else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) { struct schedstat_cpu *cpu_tail; -- cgit v1.2.3 From f7dc49645346d9d47825b60c4557da6885d48037 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:49:59 +0000 Subject: perf sched stats: Define macro for SEP_LEN Define a macro for separator length of the line in perf sched stats report. Reviewed-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1a24c4869331..3f509cfdd58c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -55,6 +55,7 @@ #define SYM_LEN 129 #define MAX_PID 1024000 #define MAX_PRIO 140 +#define SEP_LEN 100 static const char *cpu_list; static struct perf_cpu_map *user_requested_cpus; @@ -3997,7 +3998,7 @@ static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1, "PCT_CHANGE1", "PCT_CHANGE2"); printf("\n"); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); #define CALC_PCT(_x, _y) ((_y) ? ((double)(_x) / (_y)) * 100 : 0.0) @@ -4047,8 +4048,8 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1, #define DOMAIN_CATEGORY(_desc) \ do { \ size_t _len = strlen(_desc); \ - size_t _pre_dash_cnt = (100 - _len) / 2; \ - size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt; \ + size_t _pre_dash_cnt = (SEP_LEN - _len) / 2; \ + size_t _post_dash_cnt = SEP_LEN - _len - _pre_dash_cnt; \ print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\ } while (0) @@ -4238,14 +4239,14 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map ** int ret = 0; printf("Description\n"); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); printf("%-30s-> %s\n", "DESC", "Description of the field"); printf("%-30s-> %s\n", "COUNT", "Value of the field"); printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value"); printf("%-30s-> %s\n", "AVG_JIFFIES", "Avg time in jiffies between two consecutive occurrence of event"); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); printf("\n"); printf("%-65s: ", "Time elapsed (in jiffies)"); @@ -4286,16 +4287,16 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map ** return -1; } - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); if (is_summary) printf("CPU: \n"); else printf("CPU: %d\n", cs1->cpu); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); print_cpu_stats(cs1, cs2); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) { struct domain_info *dinfo1 = NULL, *dinfo2 = NULL; @@ -4329,9 +4330,9 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map ** printf("%s\n", dinfo1->cpulist); } - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); print_domain_stats(ds1, ds2, jiffies1, jiffies2); - print_separator2(100, "", 0); + print_separator2(SEP_LEN, "", 0); if (dptr2) dptr2 = list_next_entry(dptr2, domain_list); -- cgit v1.2.3 From 34b0a58eef04a49376e4103efc5b09f1e33e594a Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Tue, 27 Jan 2026 18:50:00 +0000 Subject: perf sched stats: Fixes in man page Fix the incorrect description of the schedstats report. Also fix the spelling errors in man page. Fixes: 800af362d68945e5 ("perf sched stats: Add details in man page") Reviewed-by: Shrikanth Hegde Reported-by: Shrikanth Hegde Signed-off-by: Swapnil Sapkal Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anubhav Shelat Cc: Chen Yu Cc: Gautham Shenoy Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Falcon Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 5bfb7bb6c633..4d9981609c04 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -86,7 +86,7 @@ There are several variants of 'perf sched': exposed through the file ``/proc/schedstat``. These counters are enabled or disabled via the sysctl governed by the file ``/proc/sys/kernel/sched_schedstats``. These counters accounts for many scheduler events such as ``schedule()`` calls, load-balancing - events, ``try_to_wakeup()`` call among others. This is useful in understading the + events, ``try_to_wakeup()`` call among others. This is useful in understanding the scheduler behavior for the workload. Note: The tool will not give correct results if there is topological reordering or @@ -100,7 +100,7 @@ There are several variants of 'perf sched': A detailed description of the schedstats can be found in the Kernel Documentation: https://www.kernel.org/doc/html/latest/scheduler/sched-stats.html - The result can be interprested as follows: + The result can be interpreted as follows: The `perf sched stats report` starts with description of the columns present in the report. These column names are given before cpu and domain stats to improve @@ -116,7 +116,7 @@ There are several variants of 'perf sched': Next is the total profiling time in terms of jiffies: ---------------------------------------------------------------------------------------------------- - Time elapsed (in jiffies) : 24537 + Time elapsed (in jiffies) : 2323 ---------------------------------------------------------------------------------------------------- Next is CPU scheduling statistics. These are simple diffs of /proc/schedstat CPU lines @@ -210,7 +210,7 @@ There are several variants of 'perf sched': While profiling was active, the load-balancer found 28490 times the load needs to be balanced on a newly idle CPU 0. Following value encapsulated - inside $ is average jiffies between two events (28490 / 24537 = 0.08). + inside $ is average jiffies between two events (2323 / 28490 = 0.08). Next are active_load_balance() stats. alb did not trigger while the profiling was active, hence it's all 0s. -- cgit v1.2.3 From a537c0da168a08b0b6a7f7bd9e75f4cc8d45ff57 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 23 Jan 2026 13:32:03 +0000 Subject: tools: Fix bitfield dependency failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A perf build failure was reported by Thomas Voegtle on stable kernel v6.6.120: CC tests/sample-parsing.o CC util/intel-pt-decoder/intel-pt-pkt-decoder.o CC util/perf-regs-arch/perf_regs_csky.o CC util/arm-spe-decoder/arm-spe-pkt-decoder.o CC util/perf-regs-arch/perf_regs_loongarch.o In file included from util/arm-spe-decoder/arm-spe-pkt-decoder.h:10, from util/arm-spe-decoder/arm-spe-pkt-decoder.c:14: /local/git/linux-stable-rc/tools/include/linux/bitfield.h: In function ‘le16_encode_bits’: /local/git/linux-stable-rc/tools/include/linux/bitfield.h:166:31: error: implicit declaration of function ‘cpu_to_le16’; did you mean ‘htole16’? [-Werror=implicit-function-declaration] ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ^~~~~~~~~ /local/git/linux-stable-rc/tools/include/linux/bitfield.h:149:9: note: in definition of macro ‘____MAKE_OP’ return to((v & field_mask(field)) * field_multiplier(field)); \ ^~ /local/git/linux-stable-rc/tools/include/linux/bitfield.h:170:1: note: in expansion of macro ‘__MAKE_OP’ __MAKE_OP(16) Fix this by including linux/kernel.h, which provides the required definitions. The issue was not found on the mainline due to the relevant C files have included kernel.h. It'd be good to merge this change on mainline as well for robustness. Closes: https://lore.kernel.org/stable/3a44500b-d7c8-179f-61f6-e51cb50d3512@lio96.de/ Fixes: 64d86c03e1441742 ("perf arm-spe: Extend branch operations") Reported-by: Hamza Mahfooz Reported-by: Thomas Voegtle Signed-off-by: Leo Yan Cc: Greg Kroah-Hartman Cc: Ian Rogers Cc: James Clark Cc: Leo Yan Cc: Namhyung Kim To: Sasha Levin Cc: stable@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/bitfield.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/linux/bitfield.h b/tools/include/linux/bitfield.h index 6093fa6db260..ddf81f24956b 100644 --- a/tools/include/linux/bitfield.h +++ b/tools/include/linux/bitfield.h @@ -8,6 +8,7 @@ #define _LINUX_BITFIELD_H #include +#include #include /* -- cgit v1.2.3 From 7a0ba3891104da77cfd1a16d41699e0fdf45603a Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 23 Jan 2026 13:32:04 +0000 Subject: perf: Remove redundant kernel.h include Now that the bitfield dependency is resolved, the explicit inclusion of kernel.h is no longer needed. Remove the redundant include. Signed-off-by: Leo Yan Cc: Ian Rogers Cc: James Clark Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/header.c | 1 - tools/perf/util/cs-etm.c | 1 - 2 files changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index f445a2dd6293..cbc0ba101636 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 12b55c2bc2ca..95f439c96180 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -6,7 +6,6 @@ * Author: Mathieu Poirier */ -#include #include #include #include -- cgit v1.2.3 From 19eab0efe72f02516b9f194a6ad10e7c83a009ae Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:32 -0800 Subject: perf jevents: Build support for generating metrics from python Generate extra-metrics.json and extra-metricgroups.json from python architecture specific scripts. The metrics themselves will be added in later patches. If a build takes place in tools/perf/ then extra-metrics.json and extra-metricgroups.json are generated in that directory and so added to .gitignore. If there is an OUTPUT directory then the tools/perf/pmu-events/arch files are copied to it so the generated extra-metrics.json and extra-metricgroups.json can be added/generated there. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 5 ++++ tools/perf/Makefile.perf | 2 ++ tools/perf/pmu-events/Build | 52 +++++++++++++++++++++++++++++++++- tools/perf/pmu-events/amd_metrics.py | 42 +++++++++++++++++++++++++++ tools/perf/pmu-events/arm64_metrics.py | 43 ++++++++++++++++++++++++++++ tools/perf/pmu-events/intel_metrics.py | 42 +++++++++++++++++++++++++++ 6 files changed, 185 insertions(+), 1 deletion(-) create mode 100755 tools/perf/pmu-events/amd_metrics.py create mode 100755 tools/perf/pmu-events/arm64_metrics.py create mode 100755 tools/perf/pmu-events/intel_metrics.py (limited to 'tools') diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index 5c59f954f52a..0f9451a6e39c 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -43,6 +43,11 @@ pmu-events/metric_test.log pmu-events/empty-pmu-events.log pmu-events/test-empty-pmu-events.c *.shellcheck_log +pmu-events/arch/**/extra-metrics.json +pmu-events/arch/**/extra-metricgroups.json +tests/shell/*.shellcheck_log +tests/shell/coresight/*.shellcheck_log +tests/shell/lib/*.shellcheck_log feature/ libapi/ libbpf/ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 45d5a59a02cb..b6edc8100c8e 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1273,6 +1273,8 @@ ifeq ($(OUTPUT),) pmu-events/metric_test.log \ pmu-events/test-empty-pmu-events.c \ pmu-events/empty-pmu-events.log + $(Q)find pmu-events/arch -name 'extra-metrics.json' -delete -o \ + -name 'extra-metricgroups.json' -delete else # When an OUTPUT directory is present, clean up the copied pmu-events/arch directory. $(call QUIET_CLEAN, pmu-events) $(RM) -r $(OUTPUT)pmu-events/arch \ $(OUTPUT)pmu-events/pmu-events.c \ diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index 4f9ef624ba70..ba2662d441c4 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -30,6 +30,10 @@ $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)cp $< $@ else +# Functions to extract the model from a extra-metrics.json or extra-metricgroups.json path. +model_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/extra-metric.*\.json@\1@') +vendor_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/[^/]*/extra-metric.*\.json@\1@') + # Copy checked-in json to OUTPUT for generation if it's an out of source build ifneq ($(OUTPUT),) # Remove all output directories when any source directory timestamp changes @@ -48,7 +52,53 @@ $(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@ -GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON) $(JSON_DIRS) +GEN_METRIC_DEPS := pmu-events/metric.py + +# Generate AMD Json +ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*) +ZEN_METRICS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metrics.json) +ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json) + +$(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@ + +$(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@ + +# Generate ARM Json +ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn) +ARM_METRICS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metrics.json) +ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json) + +$(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) arch > $@ + +$(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) arch > $@ + +# Generate Intel Json +INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv) +INTEL_METRICS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metrics.json) +INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json) + +$(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@ + +$(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@ + +GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) \ + $(LEGACY_CACHE_JSON) \ + $(JSON_DIRS) \ + $(ZEN_METRICS) $(ZEN_METRICGROUPS) \ + $(ARM_METRICS) $(ARM_METRICGROUPS) \ + $(INTEL_METRICS) $(INTEL_METRICGROUPS) $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY) $(call rule_mkdir) diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py new file mode 100755 index 000000000000..5f44687d8d20 --- /dev/null +++ b/tools/perf/pmu-events/amd_metrics.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +import argparse +import os +from metric import ( + JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) + +# Global command line arguments. +_args = None + + +def main() -> None: + global _args + + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError( + f'\'{path}\' is not a valid directory') + + parser = argparse.ArgumentParser(description="AMD perf json generator") + parser.add_argument( + "-metricgroups", help="Generate metricgroups data", action='store_true') + parser.add_argument("model", help="e.g. amdzen[123]") + parser.add_argument( + 'events_path', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + _args = parser.parse_args() + + all_metrics = MetricGroup("", []) + + if _args.metricgroups: + print(JsonEncodeMetricGroupDescriptions(all_metrics)) + else: + print(JsonEncodeMetric(all_metrics)) + + +if __name__ == '__main__': + main() diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py new file mode 100755 index 000000000000..204b3b08c680 --- /dev/null +++ b/tools/perf/pmu-events/arm64_metrics.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +import argparse +import os +from metric import ( + JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) + +# Global command line arguments. +_args = None + + +def main() -> None: + global _args + + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError( + f'\'{path}\' is not a valid directory') + + parser = argparse.ArgumentParser(description="ARM perf json generator") + parser.add_argument( + "-metricgroups", help="Generate metricgroups data", action='store_true') + parser.add_argument("vendor", help="e.g. arm") + parser.add_argument("model", help="e.g. neoverse-n1") + parser.add_argument( + 'events_path', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + _args = parser.parse_args() + + all_metrics = MetricGroup("", []) + + if _args.metricgroups: + print(JsonEncodeMetricGroupDescriptions(all_metrics)) + else: + print(JsonEncodeMetric(all_metrics)) + + +if __name__ == '__main__': + main() diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py new file mode 100755 index 000000000000..65ada006d05a --- /dev/null +++ b/tools/perf/pmu-events/intel_metrics.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +import argparse +import os +from metric import ( + JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) + +# Global command line arguments. +_args = None + + +def main() -> None: + global _args + + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError( + f'\'{path}\' is not a valid directory') + + parser = argparse.ArgumentParser(description="Intel perf json generator") + parser.add_argument( + "-metricgroups", help="Generate metricgroups data", action='store_true') + parser.add_argument("model", help="e.g. skylakex") + parser.add_argument( + 'events_path', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + _args = parser.parse_args() + + all_metrics = MetricGroup("", []) + + if _args.metricgroups: + print(JsonEncodeMetricGroupDescriptions(all_metrics)) + else: + print(JsonEncodeMetric(all_metrics)) + + +if __name__ == '__main__': + main() -- cgit v1.2.3 From 6bd6c5ef6c7ae35000179c9db8ecdbc3e9698d72 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:33 -0800 Subject: perf jevents: Add load event JSON to verify and allow fallbacks Add a LoadEvents function that loads all event JSON files in a directory. In the Event constructor ensure all events are defined in the event JSON except for legacy events like "cycles". If the initial event isn't found then legacy_event1 is used, and if that isn't found legacy_event2 is used. This allows a single Event to have multiple event names as models will often rename the same event over time. If the event doesn't exist an exception is raised. So that references to metrics can be added, add the MetricRef class. This doesn't validate as an event name and so provides an escape hatch for metrics to refer to each other. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/Build | 12 ++--- tools/perf/pmu-events/amd_metrics.py | 7 ++- tools/perf/pmu-events/arm64_metrics.py | 7 ++- tools/perf/pmu-events/intel_metrics.py | 7 ++- tools/perf/pmu-events/metric.py | 83 ++++++++++++++++++++++++++++++++-- 5 files changed, 101 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index ba2662d441c4..68227614d0b1 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -61,11 +61,11 @@ ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json) $(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@ $(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@ # Generate ARM Json ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn) @@ -74,11 +74,11 @@ ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json) $(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@ $(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@ # Generate Intel Json INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv) @@ -87,11 +87,11 @@ INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json $(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@ $(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS) $(call rule_mkdir) - $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@ + $(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@ GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) \ $(LEGACY_CACHE_JSON) \ diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index 5f44687d8d20..bc91d9c120fa 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -2,8 +2,8 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse import os -from metric import ( - JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) +from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, + MetricGroup) # Global command line arguments. _args = None @@ -30,6 +30,9 @@ def main() -> None: ) _args = parser.parse_args() + directory = f"{_args.events_path}/x86/{_args.model}/" + LoadEvents(directory) + all_metrics = MetricGroup("", []) if _args.metricgroups: diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py index 204b3b08c680..ac717ca3513a 100755 --- a/tools/perf/pmu-events/arm64_metrics.py +++ b/tools/perf/pmu-events/arm64_metrics.py @@ -2,8 +2,8 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse import os -from metric import ( - JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) +from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, + MetricGroup) # Global command line arguments. _args = None @@ -31,6 +31,9 @@ def main() -> None: ) _args = parser.parse_args() + directory = f"{_args.events_path}/arm64/{_args.vendor}/{_args.model}/" + LoadEvents(directory) + all_metrics = MetricGroup("", []) if _args.metricgroups: diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 65ada006d05a..b287ef115193 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -2,8 +2,8 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse import os -from metric import ( - JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup) +from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, + MetricGroup) # Global command line arguments. _args = None @@ -30,6 +30,9 @@ def main() -> None: ) _args = parser.parse_args() + directory = f"{_args.events_path}/x86/{_args.model}/" + LoadEvents(directory) + all_metrics = MetricGroup("", []) if _args.metricgroups: diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py index dd8fd06940e6..e33e163b2815 100644 --- a/tools/perf/pmu-events/metric.py +++ b/tools/perf/pmu-events/metric.py @@ -3,10 +3,56 @@ import ast import decimal import json +import os import re from enum import Enum from typing import Dict, List, Optional, Set, Tuple, Union +all_events = set() + +def LoadEvents(directory: str) -> None: + """Populate a global set of all known events for the purpose of validating Event names""" + global all_events + all_events = { + "context\\-switches", + "cpu\\-cycles", + "cycles", + "duration_time", + "instructions", + "l2_itlb_misses", + } + for file in os.listdir(os.fsencode(directory)): + filename = os.fsdecode(file) + if filename.endswith(".json"): + try: + for x in json.load(open(f"{directory}/{filename}")): + if "EventName" in x: + all_events.add(x["EventName"]) + elif "ArchStdEvent" in x: + all_events.add(x["ArchStdEvent"]) + except json.decoder.JSONDecodeError: + # The generated directory may be the same as the input, which + # causes partial json files. Ignore errors. + pass + + +def CheckEvent(name: str) -> bool: + """Check the event name exists in the set of all loaded events""" + global all_events + if len(all_events) == 0: + # No events loaded so assume any event is good. + return True + + if ':' in name: + # Remove trailing modifier. + name = name[:name.find(':')] + elif '/' in name: + # Name could begin with a PMU or an event, for now assume it is good. + return True + + return name in all_events + + class MetricConstraint(Enum): GROUPED_EVENTS = 0 NO_GROUP_EVENTS = 1 @@ -317,9 +363,18 @@ def _FixEscapes(s: str) -> str: class Event(Expression): """An event in an expression.""" - def __init__(self, name: str, legacy_name: str = ''): - self.name = _FixEscapes(name) - self.legacy_name = _FixEscapes(legacy_name) + def __init__(self, *args: str): + error = "" + for name in args: + if CheckEvent(name): + self.name = _FixEscapes(name) + return + if error: + error += " or " + name + else: + error = name + global all_events + raise Exception(f"No event {error} in:\n{all_events}") def ToPerfJson(self): result = re.sub('/', '@', self.name) @@ -338,6 +393,28 @@ class Event(Expression): return self +class MetricRef(Expression): + """A metric reference in an expression.""" + + def __init__(self, name: str): + self.name = _FixEscapes(name) + + def ToPerfJson(self): + return self.name + + def ToPython(self): + return f'MetricRef(r"{self.name}")' + + def Simplify(self) -> Expression: + return self + + def Equals(self, other: Expression) -> bool: + return isinstance(other, MetricRef) and self.name == other.name + + def Substitute(self, name: str, expression: Expression) -> Expression: + return self + + class Constant(Expression): """A constant within the expression tree.""" -- cgit v1.2.3 From d10ae3a935e7fdc2b149d81e617069fcde9864e3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:34 -0800 Subject: perf jevents: Add RAPL event metric for AMD zen models Add power per second metrics based on RAPL. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index bc91d9c120fa..b6cdeb4f09fe 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -1,13 +1,36 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse +import math import os -from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, - MetricGroup) +from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, + LoadEvents, Metric, MetricGroup, Select) # Global command line arguments. _args = None +interval_sec = Event("duration_time") + + +def Rapl() -> MetricGroup: + """Processor socket power consumption estimate. + + Use events from the running average power limit (RAPL) driver. + """ + # Watts = joules/second + # Currently only energy-pkg is supported by AMD: + # https://lore.kernel.org/lkml/20220105185659.643355-1-eranian@google.com/ + pkg = Event("power/energy\\-pkg/") + cond_pkg = Select(pkg, has_event(pkg), math.nan) + scale = 2.3283064365386962890625e-10 + metrics = [ + Metric("lpm_cpu_power_pkg", "", + d_ratio(cond_pkg * scale, interval_sec), "Watts"), + ] + + return MetricGroup("lpm_cpu_power", metrics, + description="Processor socket power consumption estimates") + def main() -> None: global _args @@ -33,7 +56,9 @@ def main() -> None: directory = f"{_args.events_path}/x86/{_args.model}/" LoadEvents(directory) - all_metrics = MetricGroup("", []) + all_metrics = MetricGroup("", [ + Rapl(), + ]) if _args.metricgroups: print(JsonEncodeMetricGroupDescriptions(all_metrics)) -- cgit v1.2.3 From 6da95e1834480f94f99afde0caabc63159eab64d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:35 -0800 Subject: perf jevents: Add idle metric for AMD zen models Compute using the MSR PMU the percentage of wallclock cycles where the CPUs are in a low power state. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index b6cdeb4f09fe..f51a044b8005 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -3,8 +3,9 @@ import argparse import math import os -from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, - LoadEvents, Metric, MetricGroup, Select) +from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric, + JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, + MetricGroup, Select) # Global command line arguments. _args = None @@ -12,6 +13,16 @@ _args = None interval_sec = Event("duration_time") +def Idle() -> Metric: + cyc = Event("msr/mperf/") + tsc = Event("msr/tsc/") + low = max(tsc - cyc, 0) + return Metric( + "lpm_idle", + "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)", + d_ratio(low, tsc), "100%") + + def Rapl() -> MetricGroup: """Processor socket power consumption estimate. @@ -57,6 +68,7 @@ def main() -> None: LoadEvents(directory) all_metrics = MetricGroup("", [ + Idle(), Rapl(), ]) -- cgit v1.2.3 From 9c9efc7462487c85a269275655807631fba760fc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:36 -0800 Subject: perf jevents: Add upc metric for uops per cycle for AMD The metric adjusts for whether or not SMT is on. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index f51a044b8005..42e46b33334d 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -3,14 +3,26 @@ import argparse import math import os +from typing import Optional from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric, - JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, - MetricGroup, Select) + JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, + Metric, MetricGroup, Select) # Global command line arguments. _args = None - +_zen_model: int = 1 interval_sec = Event("duration_time") +ins = Event("instructions") +cycles = Event("cycles") +# Number of CPU cycles scaled for SMT. +smt_cycles = Select(cycles / 2, Literal("#smt_on"), cycles) + + +def AmdUpc() -> Metric: + ops = Event("ex_ret_ops", "ex_ret_cops") + upc = d_ratio(ops, smt_cycles) + return Metric("lpm_upc", "Micro-ops retired per core cycle (higher is better)", + upc, "uops/cycle") def Idle() -> Metric: @@ -45,6 +57,7 @@ def Rapl() -> MetricGroup: def main() -> None: global _args + global _zen_model def dir_path(path: str) -> str: """Validate path is a directory for argparse.""" @@ -67,7 +80,10 @@ def main() -> None: directory = f"{_args.events_path}/x86/{_args.model}/" LoadEvents(directory) + _zen_model = int(_args.model[6:]) + all_metrics = MetricGroup("", [ + AmdUpc(), Idle(), Rapl(), ]) -- cgit v1.2.3 From 78067ae26815f8432f8648657ea14c0e3c4f93ad Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:37 -0800 Subject: perf jevents: Add br metric group for branch statistics on AMD The br metric group for branches itself comprises metric groups for total, taken, conditional, fused and far metric groups using JSON events. The lack of conditional events on anything but zen2 means this category is lacking on zen1, zen3 and zen4. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 105 +++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index 42e46b33334d..38948f63cb52 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -18,6 +18,110 @@ cycles = Event("cycles") smt_cycles = Select(cycles / 2, Literal("#smt_on"), cycles) +def AmdBr(): + def Total() -> MetricGroup: + br = Event("ex_ret_brn") + br_m_all = Event("ex_ret_brn_misp") + br_clr = Event("ex_ret_brn_cond_misp", + "ex_ret_msprd_brnch_instr_dir_msmtch", + "ex_ret_brn_resync") + + br_r = d_ratio(br, interval_sec) + ins_r = d_ratio(ins, br) + misp_r = d_ratio(br_m_all, br) + clr_r = d_ratio(br_clr, interval_sec) + + return MetricGroup("lpm_br_total", [ + Metric("lpm_br_total_retired", + "The number of branch instructions retired per second.", br_r, + "insn/s"), + Metric( + "lpm_br_total_mispred", + "The number of branch instructions retired, of any type, that were " + "not correctly predicted as a percentage of all branch instrucions.", + misp_r, "100%"), + Metric("lpm_br_total_insn_between_branches", + "The number of instructions divided by the number of branches.", + ins_r, "insn"), + Metric("lpm_br_total_insn_fe_resteers", + "The number of resync branches per second.", clr_r, "req/s") + ]) + + def Taken() -> MetricGroup: + br = Event("ex_ret_brn_tkn") + br_m_tk = Event("ex_ret_brn_tkn_misp") + br_r = d_ratio(br, interval_sec) + ins_r = d_ratio(ins, br) + misp_r = d_ratio(br_m_tk, br) + return MetricGroup("lpm_br_taken", [ + Metric("lpm_br_taken_retired", + "The number of taken branches that were retired per second.", + br_r, "insn/s"), + Metric( + "lpm_br_taken_mispred", + "The number of retired taken branch instructions that were " + "mispredicted as a percentage of all taken branches.", misp_r, + "100%"), + Metric( + "lpm_br_taken_insn_between_branches", + "The number of instructions divided by the number of taken branches.", + ins_r, "insn"), + ]) + + def Conditional() -> Optional[MetricGroup]: + global _zen_model + br = Event("ex_ret_brn_cond", "ex_ret_cond") + br_r = d_ratio(br, interval_sec) + ins_r = d_ratio(ins, br) + + metrics = [ + Metric("lpm_br_cond_retired", "Retired conditional branch instructions.", + br_r, "insn/s"), + Metric("lpm_br_cond_insn_between_branches", + "The number of instructions divided by the number of conditional " + "branches.", ins_r, "insn"), + ] + if _zen_model == 2: + br_m_cond = Event("ex_ret_cond_misp") + misp_r = d_ratio(br_m_cond, br) + metrics += [ + Metric("lpm_br_cond_mispred", + "Retired conditional branch instructions mispredicted as a " + "percentage of all conditional branches.", misp_r, "100%"), + ] + + return MetricGroup("lpm_br_cond", metrics) + + def Fused() -> MetricGroup: + br = Event("ex_ret_fused_instr", "ex_ret_fus_brnch_inst") + br_r = d_ratio(br, interval_sec) + ins_r = d_ratio(ins, br) + return MetricGroup("lpm_br_cond", [ + Metric("lpm_br_fused_retired", + "Retired fused branch instructions per second.", br_r, "insn/s"), + Metric( + "lpm_br_fused_insn_between_branches", + "The number of instructions divided by the number of fused " + "branches.", ins_r, "insn"), + ]) + + def Far() -> MetricGroup: + br = Event("ex_ret_brn_far") + br_r = d_ratio(br, interval_sec) + ins_r = d_ratio(ins, br) + return MetricGroup("lpm_br_far", [ + Metric("lpm_br_far_retired", "Retired far control transfers per second.", + br_r, "insn/s"), + Metric( + "lpm_br_far_insn_between_branches", + "The number of instructions divided by the number of far branches.", + ins_r, "insn"), + ]) + + return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Fused(), Far()], + description="breakdown of retired branch instructions") + + def AmdUpc() -> Metric: ops = Event("ex_ret_ops", "ex_ret_cops") upc = d_ratio(ops, smt_cycles) @@ -83,6 +187,7 @@ def main() -> None: _zen_model = int(_args.model[6:]) all_metrics = MetricGroup("", [ + AmdBr(), AmdUpc(), Idle(), Rapl(), -- cgit v1.2.3 From e596f329668ec2b5da6ac60fc87c035c5a337d1f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:38 -0800 Subject: perf jevents: Add itlb metric group for AMD Add metrics that give an overview and details of the l1 itlb (zen1, zen2, zen3) and l2 itlb (all zens). Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index 38948f63cb52..8fb0b55074a2 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -122,6 +122,54 @@ def AmdBr(): description="breakdown of retired branch instructions") +def AmdItlb(): + global _zen_model + l2h = Event("bp_l1_tlb_miss_l2_tlb_hit", "bp_l1_tlb_miss_l2_hit") + l2m = Event("l2_itlb_misses") + l2r = l2h + l2m + + itlb_l1_mg = None + l1m = l2r + if _zen_model <= 3: + l1r = Event("ic_fw32") + l1h = max(l1r - l1m, 0) + itlb_l1_mg = MetricGroup("lpm_itlb_l1", [ + Metric("lpm_itlb_l1_hits", + "L1 ITLB hits as a perecentage of L1 ITLB accesses.", + d_ratio(l1h, l1h + l1m), "100%"), + Metric("lpm_itlb_l1_miss", + "L1 ITLB misses as a perecentage of L1 ITLB accesses.", + d_ratio(l1m, l1h + l1m), "100%"), + Metric("lpm_itlb_l1_reqs", + "The number of 32B fetch windows transferred from IC pipe to DE " + "instruction decoder per second.", d_ratio( + l1r, interval_sec), + "windows/sec"), + ]) + + return MetricGroup("lpm_itlb", [ + MetricGroup("lpm_itlb_ov", [ + Metric("lpm_itlb_ov_insn_bt_l1_miss", + "Number of instructions between l1 misses", d_ratio( + ins, l1m), "insns"), + Metric("lpm_itlb_ov_insn_bt_l2_miss", + "Number of instructions between l2 misses", d_ratio( + ins, l2m), "insns"), + ]), + itlb_l1_mg, + MetricGroup("lpm_itlb_l2", [ + Metric("lpm_itlb_l2_hits", + "L2 ITLB hits as a percentage of all L2 ITLB accesses.", + d_ratio(l2h, l2r), "100%"), + Metric("lpm_itlb_l2_miss", + "L2 ITLB misses as a percentage of all L2 ITLB accesses.", + d_ratio(l2m, l2r), "100%"), + Metric("lpm_itlb_l2_reqs", "ITLB accesses per second.", + d_ratio(l2r, interval_sec), "accesses/sec"), + ]), + ], description="Instruction TLB breakdown") + + def AmdUpc() -> Metric: ops = Event("ex_ret_ops", "ex_ret_cops") upc = d_ratio(ops, smt_cycles) @@ -188,6 +236,7 @@ def main() -> None: all_metrics = MetricGroup("", [ AmdBr(), + AmdItlb(), AmdUpc(), Idle(), Rapl(), -- cgit v1.2.3 From c4108b9509f2cc1fab66c64c2de55ac4f9ee38b4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:39 -0800 Subject: perf jevents: Add dtlb metric group for AMD Add metrics that give an overview and details of the dtlb (zen1, zen2, zen3). Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 111 +++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index 8fb0b55074a2..a4ff88de08b5 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -122,6 +122,116 @@ def AmdBr(): description="breakdown of retired branch instructions") +def AmdDtlb() -> Optional[MetricGroup]: + global _zen_model + if _zen_model >= 4: + return None + + d_dat = Event("ls_dc_accesses") if _zen_model <= 3 else None + d_h4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit") + d_hcoal = Event( + "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit") if _zen_model >= 2 else 0 + d_h2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit") + d_h1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit") + + d_m4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss") + d_mcoal = Event( + "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss") if _zen_model >= 2 else 0 + d_m2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss") + d_m1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss") + + d_w0 = Event("ls_tablewalker.dc_type0") if _zen_model <= 3 else None + d_w1 = Event("ls_tablewalker.dc_type1") if _zen_model <= 3 else None + walks = d_w0 + d_w1 + walks_r = d_ratio(walks, interval_sec) + ins_w = d_ratio(ins, walks) + l1 = d_dat + l1_r = d_ratio(l1, interval_sec) + l2_hits = d_h4k + d_hcoal + d_h2m + d_h1g + l2_miss = d_m4k + d_mcoal + d_m2m + d_m1g + l2_r = d_ratio(l2_hits + l2_miss, interval_sec) + l1_miss = l2_hits + l2_miss + walks + l1_hits = max(l1 - l1_miss, 0) + ins_l = d_ratio(ins, l1_miss) + + return MetricGroup("lpm_dtlb", [ + MetricGroup("lpm_dtlb_ov", [ + Metric("lpm_dtlb_ov_insn_bt_l1_miss", + "DTLB overview: instructions between l1 misses.", ins_l, + "insns"), + Metric("lpm_dtlb_ov_insn_bt_walks", + "DTLB overview: instructions between dtlb page table walks.", + ins_w, "insns"), + ]), + MetricGroup("lpm_dtlb_l1", [ + Metric("lpm_dtlb_l1_hits", + "DTLB L1 hits as percentage of all DTLB L1 accesses.", + d_ratio(l1_hits, l1), "100%"), + Metric("lpm_dtlb_l1_miss", + "DTLB L1 misses as percentage of all DTLB L1 accesses.", + d_ratio(l1_miss, l1), "100%"), + Metric("lpm_dtlb_l1_reqs", "DTLB L1 accesses per second.", l1_r, + "insns/s"), + ]), + MetricGroup("lpm_dtlb_l2", [ + Metric("lpm_dtlb_l2_hits", + "DTLB L2 hits as percentage of all DTLB L2 accesses.", + d_ratio(l2_hits, l2_hits + l2_miss), "100%"), + Metric("lpm_dtlb_l2_miss", + "DTLB L2 misses as percentage of all DTLB L2 accesses.", + d_ratio(l2_miss, l2_hits + l2_miss), "100%"), + Metric("lpm_dtlb_l2_reqs", "DTLB L2 accesses per second.", l2_r, + "insns/s"), + MetricGroup("lpm_dtlb_l2_4kb", [ + Metric( + "lpm_dtlb_l2_4kb_hits", + "DTLB L2 4kb page size hits as percentage of all DTLB L2 4kb " + "accesses.", d_ratio(d_h4k, d_h4k + d_m4k), "100%"), + Metric( + "lpm_dtlb_l2_4kb_miss", + "DTLB L2 4kb page size misses as percentage of all DTLB L2 4kb" + "accesses.", d_ratio(d_m4k, d_h4k + d_m4k), "100%") + ]), + MetricGroup("lpm_dtlb_l2_coalesced", [ + Metric( + "lpm_dtlb_l2_coal_hits", + "DTLB L2 coalesced page (16kb) hits as percentage of all DTLB " + "L2 coalesced accesses.", d_ratio(d_hcoal, + d_hcoal + d_mcoal), "100%"), + Metric( + "lpm_dtlb_l2_coal_miss", + "DTLB L2 coalesced page (16kb) misses as percentage of all " + "DTLB L2 coalesced accesses.", + d_ratio(d_mcoal, d_hcoal + d_mcoal), "100%") + ]), + MetricGroup("lpm_dtlb_l2_2mb", [ + Metric( + "lpm_dtlb_l2_2mb_hits", + "DTLB L2 2mb page size hits as percentage of all DTLB L2 2mb " + "accesses.", d_ratio(d_h2m, d_h2m + d_m2m), "100%"), + Metric( + "lpm_dtlb_l2_2mb_miss", + "DTLB L2 2mb page size misses as percentage of all DTLB L2 " + "accesses.", d_ratio(d_m2m, d_h2m + d_m2m), "100%") + ]), + MetricGroup("lpm_dtlb_l2_1g", [ + Metric( + "lpm_dtlb_l2_1g_hits", + "DTLB L2 1gb page size hits as percentage of all DTLB L2 1gb " + "accesses.", d_ratio(d_h1g, d_h1g + d_m1g), "100%"), + Metric( + "lpm_dtlb_l2_1g_miss", + "DTLB L2 1gb page size misses as percentage of all DTLB L2 " + "1gb accesses.", d_ratio(d_m1g, d_h1g + d_m1g), "100%") + ]), + ]), + MetricGroup("lpm_dtlb_walks", [ + Metric("lpm_dtlb_walks_reqs", "DTLB page table walks per second.", + walks_r, "walks/s"), + ]), + ], description="Data TLB metrics") + + def AmdItlb(): global _zen_model l2h = Event("bp_l1_tlb_miss_l2_tlb_hit", "bp_l1_tlb_miss_l2_hit") @@ -236,6 +346,7 @@ def main() -> None: all_metrics = MetricGroup("", [ AmdBr(), + AmdDtlb(), AmdItlb(), AmdUpc(), Idle(), -- cgit v1.2.3 From 5ecb1622d000dffb9245df8d264d9c1e7492874f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:40 -0800 Subject: perf jevents: Add uncore l3 metric group for AMD Metrics use the amd_l3 PMU for access/miss/hit information. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index a4ff88de08b5..d71997177239 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -317,6 +317,24 @@ def Rapl() -> MetricGroup: description="Processor socket power consumption estimates") +def UncoreL3(): + acc = Event("l3_lookup_state.all_coherent_accesses_to_l3", + "l3_lookup_state.all_l3_req_typs") + miss = Event("l3_lookup_state.l3_miss", + "l3_comb_clstr_state.request_miss") + acc = max(acc, miss) + hits = acc - miss + + return MetricGroup("lpm_l3", [ + Metric("lpm_l3_accesses", "L3 victim cache accesses", + d_ratio(acc, interval_sec), "accesses/sec"), + Metric("lpm_l3_hits", "L3 victim cache hit rate", + d_ratio(hits, acc), "100%"), + Metric("lpm_l3_miss", "L3 victim cache miss rate", d_ratio(miss, acc), + "100%"), + ], description="L3 cache breakdown per CCX") + + def main() -> None: global _args global _zen_model @@ -351,6 +369,7 @@ def main() -> None: AmdUpc(), Idle(), Rapl(), + UncoreL3(), ]) if _args.metricgroups: -- cgit v1.2.3 From fb4c0581740d99029761f4ad84fc0dbc48913eea Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:41 -0800 Subject: perf jevents: Add load store breakdown metrics ldst for AMD Give breakdown of number of instructions. Use the counter mask (cmask) to show the number of cycles taken to retire the instructions. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 75 ++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index d71997177239..b3de74babe40 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -280,6 +280,80 @@ def AmdItlb(): ], description="Instruction TLB breakdown") +def AmdLdSt() -> MetricGroup: + ldst_ld = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch") + ldst_st = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch") + ldst_ldc1 = Event(f"{ldst_ld}/cmask=1/") + ldst_stc1 = Event(f"{ldst_st}/cmask=1/") + ldst_ldc2 = Event(f"{ldst_ld}/cmask=2/") + ldst_stc2 = Event(f"{ldst_st}/cmask=2/") + ldst_ldc3 = Event(f"{ldst_ld}/cmask=3/") + ldst_stc3 = Event(f"{ldst_st}/cmask=3/") + ldst_cyc = Event("ls_not_halted_cyc") + + ld_rate = d_ratio(ldst_ld, interval_sec) + st_rate = d_ratio(ldst_st, interval_sec) + + ld_v1 = max(ldst_ldc1 - ldst_ldc2, 0) + ld_v2 = max(ldst_ldc2 - ldst_ldc3, 0) + ld_v3 = ldst_ldc3 + + st_v1 = max(ldst_stc1 - ldst_stc2, 0) + st_v2 = max(ldst_stc2 - ldst_stc3, 0) + st_v3 = ldst_stc3 + + return MetricGroup("lpm_ldst", [ + MetricGroup("lpm_ldst_total", [ + Metric("lpm_ldst_total_ld", "Number of loads dispatched per second.", + ld_rate, "insns/sec"), + Metric("lpm_ldst_total_st", "Number of stores dispatched per second.", + st_rate, "insns/sec"), + ]), + MetricGroup("lpm_ldst_percent_insn", [ + Metric("lpm_ldst_percent_insn_ld", + "Load instructions as a percentage of all instructions.", + d_ratio(ldst_ld, ins), "100%"), + Metric("lpm_ldst_percent_insn_st", + "Store instructions as a percentage of all instructions.", + d_ratio(ldst_st, ins), "100%"), + ]), + MetricGroup("lpm_ldst_ret_loads_per_cycle", [ + Metric( + "lpm_ldst_ret_loads_per_cycle_1", + "Load instructions retiring in 1 cycle as a percentage of all " + "unhalted cycles.", d_ratio(ld_v1, ldst_cyc), "100%"), + Metric( + "lpm_ldst_ret_loads_per_cycle_2", + "Load instructions retiring in 2 cycles as a percentage of all " + "unhalted cycles.", d_ratio(ld_v2, ldst_cyc), "100%"), + Metric( + "lpm_ldst_ret_loads_per_cycle_3", + "Load instructions retiring in 3 or more cycles as a percentage" + "of all unhalted cycles.", d_ratio(ld_v3, ldst_cyc), "100%"), + ]), + MetricGroup("lpm_ldst_ret_stores_per_cycle", [ + Metric( + "lpm_ldst_ret_stores_per_cycle_1", + "Store instructions retiring in 1 cycle as a percentage of all " + "unhalted cycles.", d_ratio(st_v1, ldst_cyc), "100%"), + Metric( + "lpm_ldst_ret_stores_per_cycle_2", + "Store instructions retiring in 2 cycles as a percentage of all " + "unhalted cycles.", d_ratio(st_v2, ldst_cyc), "100%"), + Metric( + "lpm_ldst_ret_stores_per_cycle_3", + "Store instructions retiring in 3 or more cycles as a percentage" + "of all unhalted cycles.", d_ratio(st_v3, ldst_cyc), "100%"), + ]), + MetricGroup("lpm_ldst_insn_bt", [ + Metric("lpm_ldst_insn_bt_ld", "Number of instructions between loads.", + d_ratio(ins, ldst_ld), "insns"), + Metric("lpm_ldst_insn_bt_st", "Number of instructions between stores.", + d_ratio(ins, ldst_st), "insns"), + ]) + ], description="Breakdown of load/store instructions") + + def AmdUpc() -> Metric: ops = Event("ex_ret_ops", "ex_ret_cops") upc = d_ratio(ops, smt_cycles) @@ -366,6 +440,7 @@ def main() -> None: AmdBr(), AmdDtlb(), AmdItlb(), + AmdLdSt(), AmdUpc(), Idle(), Rapl(), -- cgit v1.2.3 From 3563030d4f77856bd0a32492fbe9edc3d36248ec Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:42 -0800 Subject: perf jevents: Add context switch metrics for AMD Metrics break down context switches for different kinds of instruction. Reviewed-by: Sandipan Das Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Thomas Falcon Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/amd_metrics.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index b3de74babe40..83e77ccc059e 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -122,6 +122,38 @@ def AmdBr(): description="breakdown of retired branch instructions") +def AmdCtxSw() -> MetricGroup: + cs = Event("context\\-switches") + metrics = [ + Metric("lpm_cs_rate", "Context switches per second", + d_ratio(cs, interval_sec), "ctxsw/s") + ] + + ev = Event("instructions") + metrics.append(Metric("lpm_cs_instr", "Instructions per context switch", + d_ratio(ev, cs), "instr/cs")) + + ev = Event("cycles") + metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch", + d_ratio(ev, cs), "cycles/cs")) + + ev = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch") + metrics.append(Metric("lpm_cs_loads", "Loads per context switch", + d_ratio(ev, cs), "loads/cs")) + + ev = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch") + metrics.append(Metric("lpm_cs_stores", "Stores per context switch", + d_ratio(ev, cs), "stores/cs")) + + ev = Event("ex_ret_brn_tkn") + metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch", + d_ratio(ev, cs), "br_taken/cs")) + + return MetricGroup("lpm_cs", metrics, + description=("Number of context switches per second, instructions " + "retired & core cycles between context switches")) + + def AmdDtlb() -> Optional[MetricGroup]: global _zen_model if _zen_model >= 4: @@ -438,6 +470,7 @@ def main() -> None: all_metrics = MetricGroup("", [ AmdBr(), + AmdCtxSw(), AmdDtlb(), AmdItlb(), AmdLdSt(), -- cgit v1.2.3 From bab90b3b46cd74d5f562b0a7b2bef1222e8960af Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:43 -0800 Subject: perf jevents: Add RAPL metrics for all Intel models Add a 'cpu_power' metric group that computes the power consumption from RAPL events if they are present. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 44 +++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index b287ef115193..61778deedfff 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -1,12 +1,48 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse +import math import os -from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, - MetricGroup) +from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, + LoadEvents, Metric, MetricGroup, Select) # Global command line arguments. _args = None +interval_sec = Event("duration_time") + + +def Rapl() -> MetricGroup: + """Processor power consumption estimate. + + Use events from the running average power limit (RAPL) driver. + """ + # Watts = joules/second + pkg = Event("power/energy\\-pkg/") + cond_pkg = Select(pkg, has_event(pkg), math.nan) + cores = Event("power/energy\\-cores/") + cond_cores = Select(cores, has_event(cores), math.nan) + ram = Event("power/energy\\-ram/") + cond_ram = Select(ram, has_event(ram), math.nan) + gpu = Event("power/energy\\-gpu/") + cond_gpu = Select(gpu, has_event(gpu), math.nan) + psys = Event("power/energy\\-psys/") + cond_psys = Select(psys, has_event(psys), math.nan) + scale = 2.3283064365386962890625e-10 + metrics = [ + Metric("lpm_cpu_power_pkg", "", + d_ratio(cond_pkg * scale, interval_sec), "Watts"), + Metric("lpm_cpu_power_cores", "", + d_ratio(cond_cores * scale, interval_sec), "Watts"), + Metric("lpm_cpu_power_ram", "", + d_ratio(cond_ram * scale, interval_sec), "Watts"), + Metric("lpm_cpu_power_gpu", "", + d_ratio(cond_gpu * scale, interval_sec), "Watts"), + Metric("lpm_cpu_power_psys", "", + d_ratio(cond_psys * scale, interval_sec), "Watts"), + ] + + return MetricGroup("lpm_cpu_power", metrics, + description="Running Average Power Limit (RAPL) power consumption estimates") def main() -> None: @@ -33,7 +69,9 @@ def main() -> None: directory = f"{_args.events_path}/x86/{_args.model}/" LoadEvents(directory) - all_metrics = MetricGroup("", []) + all_metrics = MetricGroup("", [ + Rapl(), + ]) if _args.metricgroups: print(JsonEncodeMetricGroupDescriptions(all_metrics)) -- cgit v1.2.3 From 1d519e5aa8ee025a13a822ed87fa6c9f249c63b1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:44 -0800 Subject: perf jevents: Add idle metric for Intel models Compute using the msr PMU the percentage of wallclock cycles where the CPUs are in a low power state. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 61778deedfff..0cb7a38ad238 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -3,14 +3,25 @@ import argparse import math import os -from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, - LoadEvents, Metric, MetricGroup, Select) +from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric, + JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, + MetricGroup, Select) # Global command line arguments. _args = None interval_sec = Event("duration_time") +def Idle() -> Metric: + cyc = Event("msr/mperf/") + tsc = Event("msr/tsc/") + low = max(tsc - cyc, 0) + return Metric( + "lpm_idle", + "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)", + d_ratio(low, tsc), "100%") + + def Rapl() -> MetricGroup: """Processor power consumption estimate. @@ -70,6 +81,7 @@ def main() -> None: LoadEvents(directory) all_metrics = MetricGroup("", [ + Idle(), Rapl(), ]) -- cgit v1.2.3 From 61b7b2ef64f8c9cf48ae1b0bfe2ee0bcb9bb3181 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:45 -0800 Subject: perf jevents: Add CheckPmu to see if a PMU is in loaded JSON events CheckPmu can be used to determine if hybrid events are present, allowing for hybrid conditional metrics/events/pmus to be premised on the JSON files rather than hard coded tables. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/metric.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py index e33e163b2815..62d1a1e1d458 100644 --- a/tools/perf/pmu-events/metric.py +++ b/tools/perf/pmu-events/metric.py @@ -8,10 +8,12 @@ import re from enum import Enum from typing import Dict, List, Optional, Set, Tuple, Union +all_pmus = set() all_events = set() def LoadEvents(directory: str) -> None: """Populate a global set of all known events for the purpose of validating Event names""" + global all_pmus global all_events all_events = { "context\\-switches", @@ -26,6 +28,8 @@ def LoadEvents(directory: str) -> None: if filename.endswith(".json"): try: for x in json.load(open(f"{directory}/{filename}")): + if "Unit" in x: + all_pmus.add(x["Unit"]) if "EventName" in x: all_events.add(x["EventName"]) elif "ArchStdEvent" in x: @@ -36,6 +40,10 @@ def LoadEvents(directory: str) -> None: pass +def CheckPmu(name: str) -> bool: + return name in all_pmus + + def CheckEvent(name: str) -> bool: """Check the event name exists in the set of all loaded events""" global all_events -- cgit v1.2.3 From 17d616b7d98dcc15561b83a7e1c78f304b8cea74 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:46 -0800 Subject: perf jevents: Add smi metric group for Intel models Allow duplicated metric to be dropped from JSON files. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 0cb7a38ad238..94604b1b07d8 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -3,9 +3,9 @@ import argparse import math import os -from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric, +from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, - MetricGroup, Select) + MetricGroup, MetricRef, Select) # Global command line arguments. _args = None @@ -56,6 +56,25 @@ def Rapl() -> MetricGroup: description="Running Average Power Limit (RAPL) power consumption estimates") +def Smi() -> MetricGroup: + pmu = "" if CheckPmu("cpu_core") else "cpu" + aperf = Event('msr/aperf/') + cycles = Event('cycles') + smi_num = Event('msr/smi/') + smi_cycles = Select(Select((aperf - cycles) / aperf, smi_num > 0, 0), + has_event(aperf), + 0) + return MetricGroup('smi', [ + Metric('smi_num', 'Number of SMI interrupts.', + Select(smi_num, has_event(smi_num), 0), 'SMI#'), + # Note, the smi_cycles "Event" is really a reference to the metric. + Metric('smi_cycles', + 'Percentage of cycles spent in System Management Interrupts. ' + f'Requires /sys/bus/event_source/devices/{pmu}/freeze_on_smi to be 1.', + smi_cycles, '100%', threshold=(MetricRef('smi_cycles') > 0.10)) + ], description='System Management Interrupt metrics') + + def main() -> None: global _args @@ -83,6 +102,7 @@ def main() -> None: all_metrics = MetricGroup("", [ Idle(), Rapl(), + Smi(), ]) if _args.metricgroups: -- cgit v1.2.3 From 7eb9fa417c0218fd4e3b8d18894d7b89c15fb8ba Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:47 -0800 Subject: perf jevents: Mark metrics with experimental events as experimental When metrics are made with experimental events it is desirable the metric description also carries this information in case of metric inaccuracies. Suggested-by: Perry Taylor Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/metric.py | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py index 62d1a1e1d458..2029b6e28365 100644 --- a/tools/perf/pmu-events/metric.py +++ b/tools/perf/pmu-events/metric.py @@ -10,11 +10,13 @@ from typing import Dict, List, Optional, Set, Tuple, Union all_pmus = set() all_events = set() +experimental_events = set() def LoadEvents(directory: str) -> None: """Populate a global set of all known events for the purpose of validating Event names""" global all_pmus global all_events + global experimental_events all_events = { "context\\-switches", "cpu\\-cycles", @@ -32,6 +34,8 @@ def LoadEvents(directory: str) -> None: all_pmus.add(x["Unit"]) if "EventName" in x: all_events.add(x["EventName"]) + if "Experimental" in x and x["Experimental"] == "1": + experimental_events.add(x["EventName"]) elif "ArchStdEvent" in x: all_events.add(x["ArchStdEvent"]) except json.decoder.JSONDecodeError: @@ -61,6 +65,18 @@ def CheckEvent(name: str) -> bool: return name in all_events +def IsExperimentalEvent(name: str) -> bool: + global experimental_events + if ':' in name: + # Remove trailing modifier. + name = name[:name.find(':')] + elif '/' in name: + # Name could begin with a PMU or an event, for now assume it is not experimental. + return False + + return name in experimental_events + + class MetricConstraint(Enum): GROUPED_EVENTS = 0 NO_GROUP_EVENTS = 1 @@ -82,6 +98,10 @@ class Expression: """Returns a simplified version of self.""" raise NotImplementedError() + def HasExperimentalEvents(self) -> bool: + """Are experimental events used in the expression?""" + raise NotImplementedError() + def Equals(self, other) -> bool: """Returns true when two expressions are the same.""" raise NotImplementedError() @@ -249,6 +269,9 @@ class Operator(Expression): return Operator(self.operator, lhs, rhs) + def HasExperimentalEvents(self) -> bool: + return self.lhs.HasExperimentalEvents() or self.rhs.HasExperimentalEvents() + def Equals(self, other: Expression) -> bool: if isinstance(other, Operator): return self.operator == other.operator and self.lhs.Equals( @@ -297,6 +320,10 @@ class Select(Expression): return Select(true_val, cond, false_val) + def HasExperimentalEvents(self) -> bool: + return (self.cond.HasExperimentalEvents() or self.true_val.HasExperimentalEvents() or + self.false_val.HasExperimentalEvents()) + def Equals(self, other: Expression) -> bool: if isinstance(other, Select): return self.cond.Equals(other.cond) and self.false_val.Equals( @@ -345,6 +372,9 @@ class Function(Expression): return Function(self.fn, lhs, rhs) + def HasExperimentalEvents(self) -> bool: + return self.lhs.HasExperimentalEvents() or (self.rhs and self.rhs.HasExperimentalEvents()) + def Equals(self, other: Expression) -> bool: if isinstance(other, Function): result = self.fn == other.fn and self.lhs.Equals(other.lhs) @@ -384,6 +414,9 @@ class Event(Expression): global all_events raise Exception(f"No event {error} in:\n{all_events}") + def HasExperimentalEvents(self) -> bool: + return IsExperimentalEvent(self.name) + def ToPerfJson(self): result = re.sub('/', '@', self.name) return result @@ -416,6 +449,9 @@ class MetricRef(Expression): def Simplify(self) -> Expression: return self + def HasExperimentalEvents(self) -> bool: + return False + def Equals(self, other: Expression) -> bool: return isinstance(other, MetricRef) and self.name == other.name @@ -443,6 +479,9 @@ class Constant(Expression): def Simplify(self) -> Expression: return self + def HasExperimentalEvents(self) -> bool: + return False + def Equals(self, other: Expression) -> bool: return isinstance(other, Constant) and self.value == other.value @@ -465,6 +504,9 @@ class Literal(Expression): def Simplify(self) -> Expression: return self + def HasExperimentalEvents(self) -> bool: + return False + def Equals(self, other: Expression) -> bool: return isinstance(other, Literal) and self.value == other.value @@ -527,6 +569,8 @@ class Metric: self.name = name self.description = description self.expr = expr.Simplify() + if self.expr.HasExperimentalEvents(): + self.description += " (metric should be considered experimental as it contains experimental events)." # Workraound valid_only_metric hiding certain metrics based on unit. scale_unit = scale_unit.replace('/sec', ' per sec') if scale_unit[0].isdigit(): -- cgit v1.2.3 From 8c345f35003269eaceeb84b3b14588aeb2bae6f7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:48 -0800 Subject: perf jevents: Add tsx metric group for Intel models Allow duplicated metric to be dropped from JSON files. Detect when TSX is supported by a model by using the JSON events, use sysfs events at runtime as hypervisors, etc. may disable TSX. Add CheckPmu to metric to determine if which PMUs have been associated with the loaded events. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 94604b1b07d8..05f3d94ec5d5 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -3,6 +3,7 @@ import argparse import math import os +from typing import Optional from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, MetricGroup, MetricRef, Select) @@ -75,6 +76,54 @@ def Smi() -> MetricGroup: ], description='System Management Interrupt metrics') +def Tsx() -> Optional[MetricGroup]: + pmu = "cpu_core" if CheckPmu("cpu_core") else "cpu" + cycles = Event('cycles') + cycles_in_tx = Event(f'{pmu}/cycles\\-t/') + cycles_in_tx_cp = Event(f'{pmu}/cycles\\-ct/') + try: + # Test if the tsx event is present in the json, prefer the + # sysfs version so that we can detect its presence at runtime. + transaction_start = Event("RTM_RETIRED.START") + transaction_start = Event(f'{pmu}/tx\\-start/') + except: + return None + + elision_start = None + try: + # Elision start isn't supported by all models, but we'll not + # generate the tsx_cycles_per_elision metric in that + # case. Again, prefer the sysfs encoding of the event. + elision_start = Event("HLE_RETIRED.START") + elision_start = Event(f'{pmu}/el\\-start/') + except: + pass + + return MetricGroup('transaction', [ + Metric('tsx_transactional_cycles', + 'Percentage of cycles within a transaction region.', + Select(cycles_in_tx / cycles, has_event(cycles_in_tx), 0), + '100%'), + Metric('tsx_aborted_cycles', 'Percentage of cycles in aborted transactions.', + Select(max(cycles_in_tx - cycles_in_tx_cp, 0) / cycles, + has_event(cycles_in_tx), + 0), + '100%'), + Metric('tsx_cycles_per_transaction', + 'Number of cycles within a transaction divided by the number of transactions.', + Select(cycles_in_tx / transaction_start, + has_event(cycles_in_tx), + 0), + "cycles / transaction"), + Metric('tsx_cycles_per_elision', + 'Number of cycles within a transaction divided by the number of elisions.', + Select(cycles_in_tx / elision_start, + has_event(elision_start), + 0), + "cycles / elision") if elision_start else None, + ], description="Breakdown of transactional memory statistics") + + def main() -> None: global _args @@ -103,6 +152,7 @@ def main() -> None: Idle(), Rapl(), Smi(), + Tsx(), ]) if _args.metricgroups: -- cgit v1.2.3 From 37d0b00a1ac85309e63700153049bc16fc446b19 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:49 -0800 Subject: perf jevents: Add br metric group for branch statistics on Intel The br metric group for branches itself comprises metric groups for total, taken, conditional, fused and far metric groups using JSON events. Conditional taken and not taken metrics are specific to Icelake and later generations, so the presence of the event is used to determine whether the metric should exist. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 138 +++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 05f3d94ec5d5..e1944d821248 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -124,6 +124,143 @@ def Tsx() -> Optional[MetricGroup]: ], description="Breakdown of transactional memory statistics") +def IntelBr(): + ins = Event("instructions") + + def Total() -> MetricGroup: + br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY") + br_m_all = Event("BR_MISP_RETIRED.ALL_BRANCHES", + "BR_INST_RETIRED.MISPRED", + "BR_MISP_EXEC.ANY") + br_clr = None + try: + br_clr = Event("BACLEARS.ANY", "BACLEARS.ALL") + except: + pass + + br_r = d_ratio(br_all, interval_sec) + ins_r = d_ratio(ins, br_all) + misp_r = d_ratio(br_m_all, br_all) + clr_r = d_ratio(br_clr, interval_sec) if br_clr else None + + return MetricGroup("lpm_br_total", [ + Metric("lpm_br_total_retired", + "The number of branch instructions retired per second.", br_r, + "insn/s"), + Metric( + "lpm_br_total_mispred", + "The number of branch instructions retired, of any type, that were " + "not correctly predicted as a percentage of all branch instrucions.", + misp_r, "100%"), + Metric("lpm_br_total_insn_between_branches", + "The number of instructions divided by the number of branches.", + ins_r, "insn"), + Metric("lpm_br_total_insn_fe_resteers", + "The number of resync branches per second.", clr_r, "req/s" + ) if clr_r else None + ]) + + def Taken() -> MetricGroup: + br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY") + br_m_tk = None + try: + br_m_tk = Event("BR_MISP_RETIRED.NEAR_TAKEN", + "BR_MISP_RETIRED.TAKEN_JCC", + "BR_INST_RETIRED.MISPRED_TAKEN") + except: + pass + br_r = d_ratio(br_all, interval_sec) + ins_r = d_ratio(ins, br_all) + misp_r = d_ratio(br_m_tk, br_all) if br_m_tk else None + return MetricGroup("lpm_br_taken", [ + Metric("lpm_br_taken_retired", + "The number of taken branches that were retired per second.", + br_r, "insn/s"), + Metric( + "lpm_br_taken_mispred", + "The number of retired taken branch instructions that were " + "mispredicted as a percentage of all taken branches.", misp_r, + "100%") if misp_r else None, + Metric( + "lpm_br_taken_insn_between_branches", + "The number of instructions divided by the number of taken branches.", + ins_r, "insn"), + ]) + + def Conditional() -> Optional[MetricGroup]: + try: + br_cond = Event("BR_INST_RETIRED.COND", + "BR_INST_RETIRED.CONDITIONAL", + "BR_INST_RETIRED.TAKEN_JCC") + br_m_cond = Event("BR_MISP_RETIRED.COND", + "BR_MISP_RETIRED.CONDITIONAL", + "BR_MISP_RETIRED.TAKEN_JCC") + except: + return None + + br_cond_nt = None + br_m_cond_nt = None + try: + br_cond_nt = Event("BR_INST_RETIRED.COND_NTAKEN") + br_m_cond_nt = Event("BR_MISP_RETIRED.COND_NTAKEN") + except: + pass + br_r = d_ratio(br_cond, interval_sec) + ins_r = d_ratio(ins, br_cond) + misp_r = d_ratio(br_m_cond, br_cond) + taken_metrics = [ + Metric("lpm_br_cond_retired", "Retired conditional branch instructions.", + br_r, "insn/s"), + Metric("lpm_br_cond_insn_between_branches", + "The number of instructions divided by the number of conditional " + "branches.", ins_r, "insn"), + Metric("lpm_br_cond_mispred", + "Retired conditional branch instructions mispredicted as a " + "percentage of all conditional branches.", misp_r, "100%"), + ] + if not br_m_cond_nt: + return MetricGroup("lpm_br_cond", taken_metrics) + + br_r = d_ratio(br_cond_nt, interval_sec) + ins_r = d_ratio(ins, br_cond_nt) + misp_r = d_ratio(br_m_cond_nt, br_cond_nt) + + not_taken_metrics = [ + Metric("lpm_br_cond_retired", "Retired conditional not taken branch instructions.", + br_r, "insn/s"), + Metric("lpm_br_cond_insn_between_branches", + "The number of instructions divided by the number of not taken conditional " + "branches.", ins_r, "insn"), + Metric("lpm_br_cond_mispred", + "Retired not taken conditional branch instructions mispredicted as a " + "percentage of all not taken conditional branches.", misp_r, "100%"), + ] + return MetricGroup("lpm_br_cond", [ + MetricGroup("lpm_br_cond_nt", not_taken_metrics), + MetricGroup("lpm_br_cond_tkn", taken_metrics), + ]) + + def Far() -> Optional[MetricGroup]: + try: + br_far = Event("BR_INST_RETIRED.FAR_BRANCH") + except: + return None + + br_r = d_ratio(br_far, interval_sec) + ins_r = d_ratio(ins, br_far) + return MetricGroup("lpm_br_far", [ + Metric("lpm_br_far_retired", "Retired far control transfers per second.", + br_r, "insn/s"), + Metric( + "lpm_br_far_insn_between_branches", + "The number of instructions divided by the number of far branches.", + ins_r, "insn"), + ]) + + return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Far()], + description="breakdown of retired branch instructions") + + def main() -> None: global _args @@ -153,6 +290,7 @@ def main() -> None: Rapl(), Smi(), Tsx(), + IntelBr(), ]) if _args.metricgroups: -- cgit v1.2.3 From 397fdb3a24435f55c4c675726d1b214954e7aa53 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:50 -0800 Subject: perf jevents: Add software prefetch (swpf) metric group for Intel Add metrics that breakdown software prefetch instruction use. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index e1944d821248..919a058c343a 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -261,6 +261,71 @@ def IntelBr(): description="breakdown of retired branch instructions") +def IntelSwpf() -> Optional[MetricGroup]: + ins = Event("instructions") + try: + s_ld = Event("MEM_INST_RETIRED.ALL_LOADS", + "MEM_UOPS_RETIRED.ALL_LOADS") + s_nta = Event("SW_PREFETCH_ACCESS.NTA") + s_t0 = Event("SW_PREFETCH_ACCESS.T0") + s_t1 = Event("SW_PREFETCH_ACCESS.T1_T2") + s_w = Event("SW_PREFETCH_ACCESS.PREFETCHW") + except: + return None + + all_sw = s_nta + s_t0 + s_t1 + s_w + swp_r = d_ratio(all_sw, interval_sec) + ins_r = d_ratio(ins, all_sw) + ld_r = d_ratio(s_ld, all_sw) + + return MetricGroup("lpm_swpf", [ + MetricGroup("lpm_swpf_totals", [ + Metric("lpm_swpf_totals_exec", "Software prefetch instructions per second", + swp_r, "swpf/s"), + Metric("lpm_swpf_totals_insn_per_pf", + "Average number of instructions between software prefetches", + ins_r, "insn/swpf"), + Metric("lpm_swpf_totals_loads_per_pf", + "Average number of loads between software prefetches", + ld_r, "loads/swpf"), + ]), + MetricGroup("lpm_swpf_bkdwn", [ + MetricGroup("lpm_swpf_bkdwn_nta", [ + Metric("lpm_swpf_bkdwn_nta_per_swpf", + "Software prefetch NTA instructions as a percent of all prefetch instructions", + d_ratio(s_nta, all_sw), "100%"), + Metric("lpm_swpf_bkdwn_nta_rate", + "Software prefetch NTA instructions per second", + d_ratio(s_nta, interval_sec), "insn/s"), + ]), + MetricGroup("lpm_swpf_bkdwn_t0", [ + Metric("lpm_swpf_bkdwn_t0_per_swpf", + "Software prefetch T0 instructions as a percent of all prefetch instructions", + d_ratio(s_t0, all_sw), "100%"), + Metric("lpm_swpf_bkdwn_t0_rate", + "Software prefetch T0 instructions per second", + d_ratio(s_t0, interval_sec), "insn/s"), + ]), + MetricGroup("lpm_swpf_bkdwn_t1_t2", [ + Metric("lpm_swpf_bkdwn_t1_t2_per_swpf", + "Software prefetch T1 or T2 instructions as a percent of all prefetch instructions", + d_ratio(s_t1, all_sw), "100%"), + Metric("lpm_swpf_bkdwn_t1_t2_rate", + "Software prefetch T1 or T2 instructions per second", + d_ratio(s_t1, interval_sec), "insn/s"), + ]), + MetricGroup("lpm_swpf_bkdwn_w", [ + Metric("lpm_swpf_bkdwn_w_per_swpf", + "Software prefetch W instructions as a percent of all prefetch instructions", + d_ratio(s_w, all_sw), "100%"), + Metric("lpm_swpf_bkdwn_w_rate", + "Software prefetch W instructions per second", + d_ratio(s_w, interval_sec), "insn/s"), + ]), + ]), + ], description="Software prefetch instruction breakdown") + + def main() -> None: global _args @@ -291,6 +356,7 @@ def main() -> None: Smi(), Tsx(), IntelBr(), + IntelSwpf(), ]) if _args.metricgroups: -- cgit v1.2.3 From cd1c6a487407a350970d1bea9d3e674f2a281179 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:51 -0800 Subject: perf jevents: Add ports metric group giving utilization on Intel The ports metric group contains a metric for each port giving its utilization as a ratio of cycles. The metrics are created by looking for UOPS_DISPATCHED.PORT events. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 35 ++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 919a058c343a..7fcc0a1c544d 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -1,12 +1,14 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) import argparse +import json import math import os +import re from typing import Optional from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric, - JsonEncodeMetricGroupDescriptions, LoadEvents, Metric, - MetricGroup, MetricRef, Select) + JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, + Metric, MetricGroup, MetricRef, Select) # Global command line arguments. _args = None @@ -261,6 +263,34 @@ def IntelBr(): description="breakdown of retired branch instructions") +def IntelPorts() -> Optional[MetricGroup]: + pipeline_events = json.load( + open(f"{_args.events_path}/x86/{_args.model}/pipeline.json")) + + core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY", + "CPU_CLK_UNHALTED.DISTRIBUTED", + "cycles") + # Number of CPU cycles scaled for SMT. + smt_cycles = Select(core_cycles / 2, Literal("#smt_on"), core_cycles) + + metrics = [] + for x in pipeline_events: + if "EventName" in x and re.search("^UOPS_DISPATCHED.PORT", x["EventName"]): + name = x["EventName"] + port = re.search(r"(PORT_[0-9].*)", name).group(0).lower() + if name.endswith("_CORE"): + cyc = core_cycles + else: + cyc = smt_cycles + metrics.append(Metric(f"lpm_{port}", f"{port} utilization (higher is better)", + d_ratio(Event(name), cyc), "100%")) + if len(metrics) == 0: + return None + + return MetricGroup("lpm_ports", metrics, "functional unit (port) utilization -- " + "fraction of cycles each port is utilized (higher is better)") + + def IntelSwpf() -> Optional[MetricGroup]: ins = Event("instructions") try: @@ -356,6 +386,7 @@ def main() -> None: Smi(), Tsx(), IntelBr(), + IntelPorts(), IntelSwpf(), ]) -- cgit v1.2.3 From 7413633e255cb02d59e9451e55fbe9e50310db18 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:52 -0800 Subject: perf jevents: Add L2 metrics for Intel Give a breakdown of various L2 counters as metrics, including totals, reads, hardware prefetcher, RFO, code and evictions. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 170 +++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 7fcc0a1c544d..d190d97f4aff 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -263,6 +263,175 @@ def IntelBr(): description="breakdown of retired branch instructions") +def IntelL2() -> Optional[MetricGroup]: + try: + DC_HIT = Event("L2_RQSTS.DEMAND_DATA_RD_HIT") + except: + return None + try: + DC_MISS = Event("L2_RQSTS.DEMAND_DATA_RD_MISS") + l2_dmnd_miss = DC_MISS + l2_dmnd_rd_all = DC_MISS + DC_HIT + except: + DC_ALL = Event("L2_RQSTS.ALL_DEMAND_DATA_RD") + l2_dmnd_miss = DC_ALL - DC_HIT + l2_dmnd_rd_all = DC_ALL + l2_dmnd_mrate = d_ratio(l2_dmnd_miss, interval_sec) + l2_dmnd_rrate = d_ratio(l2_dmnd_rd_all, interval_sec) + + DC_PFH = None + DC_PFM = None + l2_pf_all = None + l2_pf_mrate = None + l2_pf_rrate = None + try: + DC_PFH = Event("L2_RQSTS.PF_HIT") + DC_PFM = Event("L2_RQSTS.PF_MISS") + l2_pf_all = DC_PFH + DC_PFM + l2_pf_mrate = d_ratio(DC_PFM, interval_sec) + l2_pf_rrate = d_ratio(l2_pf_all, interval_sec) + except: + pass + + DC_RFOH = None + DC_RFOM = None + l2_rfo_all = None + l2_rfo_mrate = None + l2_rfo_rrate = None + try: + DC_RFOH = Event("L2_RQSTS.RFO_HIT") + DC_RFOM = Event("L2_RQSTS.RFO_MISS") + l2_rfo_all = DC_RFOH + DC_RFOM + l2_rfo_mrate = d_ratio(DC_RFOM, interval_sec) + l2_rfo_rrate = d_ratio(l2_rfo_all, interval_sec) + except: + pass + + DC_CH = None + try: + DC_CH = Event("L2_RQSTS.CODE_RD_HIT") + except: + pass + DC_CM = Event("L2_RQSTS.CODE_RD_MISS") + DC_IN = Event("L2_LINES_IN.ALL") + DC_OUT_NS = None + DC_OUT_S = None + l2_lines_out = None + l2_out_rate = None + wbn = None + isd = None + try: + DC_OUT_NS = Event("L2_LINES_OUT.NON_SILENT", + "L2_LINES_OUT.DEMAND_DIRTY", + "L2_LINES_IN.S") + DC_OUT_S = Event("L2_LINES_OUT.SILENT", + "L2_LINES_OUT.DEMAND_CLEAN", + "L2_LINES_IN.I") + if DC_OUT_S.name == "L2_LINES_OUT.SILENT" and ( + args.model.startswith("skylake") or + args.model == "cascadelakex"): + DC_OUT_S.name = "L2_LINES_OUT.SILENT/any/" + # bring is back to per-CPU + l2_s = Select(DC_OUT_S / 2, Literal("#smt_on"), DC_OUT_S) + l2_ns = DC_OUT_NS + l2_lines_out = l2_s + l2_ns + l2_out_rate = d_ratio(l2_lines_out, interval_sec) + nlr = max(l2_ns - DC_WB_U - DC_WB_D, 0) + wbn = d_ratio(nlr, interval_sec) + isd = d_ratio(l2_s, interval_sec) + except: + pass + DC_OUT_U = None + l2_pf_useless = None + l2_useless_rate = None + try: + DC_OUT_U = Event("L2_LINES_OUT.USELESS_HWPF") + l2_pf_useless = DC_OUT_U + l2_useless_rate = d_ratio(l2_pf_useless, interval_sec) + except: + pass + DC_WB_U = None + DC_WB_D = None + wbu = None + wbd = None + try: + DC_WB_U = Event("IDI_MISC.WB_UPGRADE") + DC_WB_D = Event("IDI_MISC.WB_DOWNGRADE") + wbu = d_ratio(DC_WB_U, interval_sec) + wbd = d_ratio(DC_WB_D, interval_sec) + except: + pass + + l2_lines_in = DC_IN + l2_code_all = (DC_CH + DC_CM) if DC_CH else None + l2_code_rate = d_ratio(l2_code_all, interval_sec) if DC_CH else None + l2_code_miss_rate = d_ratio(DC_CM, interval_sec) + l2_in_rate = d_ratio(l2_lines_in, interval_sec) + + return MetricGroup("lpm_l2", [ + MetricGroup("lpm_l2_totals", [ + Metric("lpm_l2_totals_in", "L2 cache total in per second", + l2_in_rate, "In/s"), + Metric("lpm_l2_totals_out", "L2 cache total out per second", + l2_out_rate, "Out/s") if l2_out_rate else None, + ]), + MetricGroup("lpm_l2_rd", [ + Metric("lpm_l2_rd_hits", "L2 cache data read hits", + d_ratio(DC_HIT, l2_dmnd_rd_all), "100%"), + Metric("lpm_l2_rd_hits", "L2 cache data read hits", + d_ratio(l2_dmnd_miss, l2_dmnd_rd_all), "100%"), + Metric("lpm_l2_rd_requests", "L2 cache data read requests per second", + l2_dmnd_rrate, "requests/s"), + Metric("lpm_l2_rd_misses", "L2 cache data read misses per second", + l2_dmnd_mrate, "misses/s"), + ]), + MetricGroup("lpm_l2_hwpf", [ + Metric("lpm_l2_hwpf_hits", "L2 cache hardware prefetcher hits", + d_ratio(DC_PFH, l2_pf_all), "100%"), + Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses", + d_ratio(DC_PFM, l2_pf_all), "100%"), + Metric("lpm_l2_hwpf_useless", "L2 cache hardware prefetcher useless prefetches per second", + l2_useless_rate, "100%") if l2_useless_rate else None, + Metric("lpm_l2_hwpf_requests", "L2 cache hardware prefetcher requests per second", + l2_pf_rrate, "100%"), + Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses per second", + l2_pf_mrate, "100%"), + ]) if DC_PFH else None, + MetricGroup("lpm_l2_rfo", [ + Metric("lpm_l2_rfo_hits", "L2 cache request for ownership (RFO) hits", + d_ratio(DC_RFOH, l2_rfo_all), "100%"), + Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses", + d_ratio(DC_RFOM, l2_rfo_all), "100%"), + Metric("lpm_l2_rfo_requests", "L2 cache request for ownership (RFO) requests per second", + l2_rfo_rrate, "requests/s"), + Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses per second", + l2_rfo_mrate, "misses/s"), + ]) if DC_RFOH else None, + MetricGroup("lpm_l2_code", [ + Metric("lpm_l2_code_hits", "L2 cache code hits", + d_ratio(DC_CH, l2_code_all), "100%") if DC_CH else None, + Metric("lpm_l2_code_misses", "L2 cache code misses", + d_ratio(DC_CM, l2_code_all), "100%") if DC_CH else None, + Metric("lpm_l2_code_requests", "L2 cache code requests per second", + l2_code_rate, "requests/s") if DC_CH else None, + Metric("lpm_l2_code_misses", "L2 cache code misses per second", + l2_code_miss_rate, "misses/s"), + ]), + MetricGroup("lpm_l2_evict", [ + MetricGroup("lpm_l2_evict_mef_lines", [ + Metric("lpm_l2_evict_mef_lines_l3_hot_lru", "L2 evictions M/E/F lines L3 hot LRU per second", + wbu, "HotLRU/s") if wbu else None, + Metric("lpm_l2_evict_mef_lines_l3_norm_lru", "L2 evictions M/E/F lines L3 normal LRU per second", + wbn, "NormLRU/s") if wbn else None, + Metric("lpm_l2_evict_mef_lines_dropped", "L2 evictions M/E/F lines dropped per second", + wbd, "dropped/s") if wbd else None, + Metric("lpm_l2_evict_is_lines_dropped", "L2 evictions I/S lines dropped per second", + isd, "dropped/s") if isd else None, + ]), + ]), + ], description="L2 data cache analysis") + + def IntelPorts() -> Optional[MetricGroup]: pipeline_events = json.load( open(f"{_args.events_path}/x86/{_args.model}/pipeline.json")) @@ -386,6 +555,7 @@ def main() -> None: Smi(), Tsx(), IntelBr(), + IntelL2(), IntelPorts(), IntelSwpf(), ]) -- cgit v1.2.3 From d80edef23124baffa2cfd61a009933d03d982741 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:53 -0800 Subject: perf jevents: Add load store breakdown metrics ldst for Intel Give breakdown of number of instructions. Use the counter mask (cmask) to show the number of cycles taken to retire the instructions. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 87 +++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index d190d97f4aff..19a284b4c520 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -8,7 +8,7 @@ import re from typing import Optional from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, - Metric, MetricGroup, MetricRef, Select) + Metric, MetricConstraint, MetricGroup, MetricRef, Select) # Global command line arguments. _args = None @@ -525,6 +525,90 @@ def IntelSwpf() -> Optional[MetricGroup]: ], description="Software prefetch instruction breakdown") +def IntelLdSt() -> Optional[MetricGroup]: + if _args.model in [ + "bonnell", + "nehalemep", + "nehalemex", + "westmereep-dp", + "westmereep-sp", + "westmereex", + ]: + return None + LDST_LD = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS") + LDST_ST = Event("MEM_INST_RETIRED.ALL_STORES", + "MEM_UOPS_RETIRED.ALL_STORES") + LDST_LDC1 = Event(f"{LDST_LD.name}/cmask=1/") + LDST_STC1 = Event(f"{LDST_ST.name}/cmask=1/") + LDST_LDC2 = Event(f"{LDST_LD.name}/cmask=2/") + LDST_STC2 = Event(f"{LDST_ST.name}/cmask=2/") + LDST_LDC3 = Event(f"{LDST_LD.name}/cmask=3/") + LDST_STC3 = Event(f"{LDST_ST.name}/cmask=3/") + ins = Event("instructions") + LDST_CYC = Event("CPU_CLK_UNHALTED.THREAD", + "CPU_CLK_UNHALTED.CORE_P", + "CPU_CLK_UNHALTED.THREAD_P") + LDST_PRE = None + try: + LDST_PRE = Event("LOAD_HIT_PREFETCH.SWPF", "LOAD_HIT_PRE.SW_PF") + except: + pass + LDST_AT = None + try: + LDST_AT = Event("MEM_INST_RETIRED.LOCK_LOADS") + except: + pass + cyc = LDST_CYC + + ld_rate = d_ratio(LDST_LD, interval_sec) + st_rate = d_ratio(LDST_ST, interval_sec) + pf_rate = d_ratio(LDST_PRE, interval_sec) if LDST_PRE else None + at_rate = d_ratio(LDST_AT, interval_sec) if LDST_AT else None + + ldst_ret_constraint = MetricConstraint.GROUPED_EVENTS + if LDST_LD.name == "MEM_UOPS_RETIRED.ALL_LOADS": + ldst_ret_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI + + return MetricGroup("lpm_ldst", [ + MetricGroup("lpm_ldst_total", [ + Metric("lpm_ldst_total_loads", "Load/store instructions total loads", + ld_rate, "loads"), + Metric("lpm_ldst_total_stores", "Load/store instructions total stores", + st_rate, "stores"), + ]), + MetricGroup("lpm_ldst_prcnt", [ + Metric("lpm_ldst_prcnt_loads", "Percent of all instructions that are loads", + d_ratio(LDST_LD, ins), "100%"), + Metric("lpm_ldst_prcnt_stores", "Percent of all instructions that are stores", + d_ratio(LDST_ST, ins), "100%"), + ]), + MetricGroup("lpm_ldst_ret_lds", [ + Metric("lpm_ldst_ret_lds_1", "Retired loads in 1 cycle", + d_ratio(max(LDST_LDC1 - LDST_LDC2, 0), cyc), "100%", + constraint=ldst_ret_constraint), + Metric("lpm_ldst_ret_lds_2", "Retired loads in 2 cycles", + d_ratio(max(LDST_LDC2 - LDST_LDC3, 0), cyc), "100%", + constraint=ldst_ret_constraint), + Metric("lpm_ldst_ret_lds_3", "Retired loads in 3 or more cycles", + d_ratio(LDST_LDC3, cyc), "100%"), + ]), + MetricGroup("lpm_ldst_ret_sts", [ + Metric("lpm_ldst_ret_sts_1", "Retired stores in 1 cycle", + d_ratio(max(LDST_STC1 - LDST_STC2, 0), cyc), "100%", + constraint=ldst_ret_constraint), + Metric("lpm_ldst_ret_sts_2", "Retired stores in 2 cycles", + d_ratio(max(LDST_STC2 - LDST_STC3, 0), cyc), "100%", + constraint=ldst_ret_constraint), + Metric("lpm_ldst_ret_sts_3", "Retired stores in 3 more cycles", + d_ratio(LDST_STC3, cyc), "100%"), + ]), + Metric("lpm_ldst_ld_hit_swpf", "Load hit software prefetches per second", + pf_rate, "swpf/s") if pf_rate else None, + Metric("lpm_ldst_atomic_lds", "Atomic loads per second", + at_rate, "loads/s") if at_rate else None, + ], description="Breakdown of load/store instructions") + + def main() -> None: global _args @@ -556,6 +640,7 @@ def main() -> None: Tsx(), IntelBr(), IntelL2(), + IntelLdSt(), IntelPorts(), IntelSwpf(), ]) -- cgit v1.2.3 From 59341f4e171170d3e2be50047508569a57eee829 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:54 -0800 Subject: perf jevents: Add ILP metrics for Intel Use the counter mask (cmask) to see how many cycles an instruction takes to retire. Present as a set of ILP metrics. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 19a284b4c520..bc3c50285916 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -263,6 +263,45 @@ def IntelBr(): description="breakdown of retired branch instructions") +def IntelIlp() -> MetricGroup: + tsc = Event("msr/tsc/") + c0 = Event("msr/mperf/") + low = tsc - c0 + inst_ret = Event("INST_RETIRED.ANY_P") + inst_ret_c = [Event(f"{inst_ret.name}/cmask={x}/") for x in range(1, 6)] + core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY", + "CPU_CLK_UNHALTED.DISTRIBUTED", + "cycles") + ilp = [d_ratio(max(inst_ret_c[x] - inst_ret_c[x + 1], 0), core_cycles) + for x in range(0, 4)] + ilp.append(d_ratio(inst_ret_c[4], core_cycles)) + ilp0 = 1 + for x in ilp: + ilp0 -= x + return MetricGroup("lpm_ilp", [ + Metric("lpm_ilp_idle", "Lower power cycles as a percentage of all cycles", + d_ratio(low, tsc), "100%"), + Metric("lpm_ilp_inst_ret_0", + "Instructions retired in 0 cycles as a percentage of all cycles", + ilp0, "100%"), + Metric("lpm_ilp_inst_ret_1", + "Instructions retired in 1 cycles as a percentage of all cycles", + ilp[0], "100%"), + Metric("lpm_ilp_inst_ret_2", + "Instructions retired in 2 cycles as a percentage of all cycles", + ilp[1], "100%"), + Metric("lpm_ilp_inst_ret_3", + "Instructions retired in 3 cycles as a percentage of all cycles", + ilp[2], "100%"), + Metric("lpm_ilp_inst_ret_4", + "Instructions retired in 4 cycles as a percentage of all cycles", + ilp[3], "100%"), + Metric("lpm_ilp_inst_ret_5", + "Instructions retired in 5 or more cycles as a percentage of all cycles", + ilp[4], "100%"), + ]) + + def IntelL2() -> Optional[MetricGroup]: try: DC_HIT = Event("L2_RQSTS.DEMAND_DATA_RD_HIT") @@ -639,6 +678,7 @@ def main() -> None: Smi(), Tsx(), IntelBr(), + IntelIlp(), IntelL2(), IntelLdSt(), IntelPorts(), -- cgit v1.2.3 From 2f3d6ea05deca7c1c653e2e53d73ac5a81378d53 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:55 -0800 Subject: perf jevents: Add context switch metrics for Intel Metrics break down context switches for different kinds of instruction. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index bc3c50285916..9cf4bd8ac769 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -263,6 +263,63 @@ def IntelBr(): description="breakdown of retired branch instructions") +def IntelCtxSw() -> MetricGroup: + cs = Event("context\\-switches") + metrics = [ + Metric("lpm_cs_rate", "Context switches per second", + d_ratio(cs, interval_sec), "ctxsw/s") + ] + + ev = Event("instructions") + metrics.append(Metric("lpm_cs_instr", "Instructions per context switch", + d_ratio(ev, cs), "instr/cs")) + + ev = Event("cycles") + metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch", + d_ratio(ev, cs), "cycles/cs")) + + try: + ev = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS") + metrics.append(Metric("lpm_cs_loads", "Loads per context switch", + d_ratio(ev, cs), "loads/cs")) + except: + pass + + try: + ev = Event("MEM_INST_RETIRED.ALL_STORES", + "MEM_UOPS_RETIRED.ALL_STORES") + metrics.append(Metric("lpm_cs_stores", "Stores per context switch", + d_ratio(ev, cs), "stores/cs")) + except: + pass + + try: + ev = Event("BR_INST_RETIRED.NEAR_TAKEN", "BR_INST_RETIRED.TAKEN_JCC") + metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch", + d_ratio(ev, cs), "br_taken/cs")) + except: + pass + + try: + l2_misses = (Event("L2_RQSTS.DEMAND_DATA_RD_MISS") + + Event("L2_RQSTS.RFO_MISS") + + Event("L2_RQSTS.CODE_RD_MISS")) + try: + l2_misses += Event("L2_RQSTS.HWPF_MISS", + "L2_RQSTS.L2_PF_MISS", "L2_RQSTS.PF_MISS") + except: + pass + + metrics.append(Metric("lpm_cs_l2_misses", "L2 misses per context switch", + d_ratio(l2_misses, cs), "l2_misses/cs")) + except: + pass + + return MetricGroup("lpm_cs", metrics, + description=("Number of context switches per second, instructions " + "retired & core cycles between context switches")) + + def IntelIlp() -> MetricGroup: tsc = Event("msr/tsc/") c0 = Event("msr/mperf/") @@ -678,6 +735,7 @@ def main() -> None: Smi(), Tsx(), IntelBr(), + IntelCtxSw(), IntelIlp(), IntelL2(), IntelLdSt(), -- cgit v1.2.3 From d666f0172ab306fa7b7af38499d51f4941460688 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:56 -0800 Subject: perf jevents: Add FPU metrics for Intel Metrics break down of floating point operations. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 9cf4bd8ac769..77b8e10194db 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -320,6 +320,102 @@ def IntelCtxSw() -> MetricGroup: "retired & core cycles between context switches")) +def IntelFpu() -> Optional[MetricGroup]: + cyc = Event("cycles") + try: + s_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", + "SIMD_INST_RETIRED.SCALAR_SINGLE") + except: + return None + d_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", + "SIMD_INST_RETIRED.SCALAR_DOUBLE") + s_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", + "SIMD_INST_RETIRED.PACKED_SINGLE") + + flop = s_64 + d_64 + 4 * s_128 + + d_128 = None + s_256 = None + d_256 = None + s_512 = None + d_512 = None + try: + d_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE") + flop += 2 * d_128 + s_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE") + flop += 8 * s_256 + d_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE") + flop += 4 * d_256 + s_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE") + flop += 16 * s_512 + d_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE") + flop += 8 * d_512 + except: + pass + + f_assist = Event("ASSISTS.FP", "FP_ASSIST.ANY", "FP_ASSIST.S") + if f_assist in [ + "ASSISTS.FP", + "FP_ASSIST.S", + ]: + f_assist += "/cmask=1/" + + flop_r = d_ratio(flop, interval_sec) + flop_c = d_ratio(flop, cyc) + nmi_constraint = MetricConstraint.GROUPED_EVENTS + if f_assist.name == "ASSISTS.FP": # Icelake+ + nmi_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI + + def FpuMetrics(group: str, fl: Optional[Event], mult: int, desc: str) -> Optional[MetricGroup]: + if not fl: + return None + + f = fl * mult + fl_r = d_ratio(f, interval_sec) + r_s = d_ratio(fl, interval_sec) + return MetricGroup(group, [ + Metric(f"{group}_of_total", desc + " floating point operations per second", + d_ratio(f, flop), "100%"), + Metric(f"{group}_flops", desc + " floating point operations per second", + fl_r, "flops/s"), + Metric(f"{group}_ops", desc + " operations per second", + r_s, "ops/s"), + ]) + + return MetricGroup("lpm_fpu", [ + MetricGroup("lpm_fpu_total", [ + Metric("lpm_fpu_total_flops", "Floating point operations per second", + flop_r, "flops/s"), + Metric("lpm_fpu_total_flopc", "Floating point operations per cycle", + flop_c, "flops/cycle", constraint=nmi_constraint), + ]), + MetricGroup("lpm_fpu_64", [ + FpuMetrics("lpm_fpu_64_single", s_64, 1, "64-bit single"), + FpuMetrics("lpm_fpu_64_double", d_64, 1, "64-bit double"), + ]), + MetricGroup("lpm_fpu_128", [ + FpuMetrics("lpm_fpu_128_single", s_128, + 4, "128-bit packed single"), + FpuMetrics("lpm_fpu_128_double", d_128, + 2, "128-bit packed double"), + ]), + MetricGroup("lpm_fpu_256", [ + FpuMetrics("lpm_fpu_256_single", s_256, + 8, "128-bit packed single"), + FpuMetrics("lpm_fpu_256_double", d_256, + 4, "128-bit packed double"), + ]), + MetricGroup("lpm_fpu_512", [ + FpuMetrics("lpm_fpu_512_single", s_512, + 16, "128-bit packed single"), + FpuMetrics("lpm_fpu_512_double", d_512, + 8, "128-bit packed double"), + ]), + Metric("lpm_fpu_assists", "FP assists as a percentage of cycles", + d_ratio(f_assist, cyc), "100%"), + ]) + + def IntelIlp() -> MetricGroup: tsc = Event("msr/tsc/") c0 = Event("msr/mperf/") @@ -736,6 +832,7 @@ def main() -> None: Tsx(), IntelBr(), IntelCtxSw(), + IntelFpu(), IntelIlp(), IntelL2(), IntelLdSt(), -- cgit v1.2.3 From 426b8442898de9afe256e3c415d272808d6b69fb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:57 -0800 Subject: perf jevents: Add Miss Level Parallelism (MLP) metric for Intel Number of outstanding load misses per cycle. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 77b8e10194db..dddeae35e4b4 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -624,6 +624,20 @@ def IntelL2() -> Optional[MetricGroup]: ], description="L2 data cache analysis") +def IntelMlp() -> Optional[Metric]: + try: + l1d = Event("L1D_PEND_MISS.PENDING") + l1dc = Event("L1D_PEND_MISS.PENDING_CYCLES") + except: + return None + + l1dc = Select(l1dc / 2, Literal("#smt_on"), l1dc) + ml = d_ratio(l1d, l1dc) + return Metric("lpm_mlp", + "Miss level parallelism - number of outstanding load misses per cycle (higher is better)", + ml, "load_miss_pending/cycle") + + def IntelPorts() -> Optional[MetricGroup]: pipeline_events = json.load( open(f"{_args.events_path}/x86/{_args.model}/pipeline.json")) @@ -836,6 +850,7 @@ def main() -> None: IntelIlp(), IntelL2(), IntelLdSt(), + IntelMlp(), IntelPorts(), IntelSwpf(), ]) -- cgit v1.2.3 From 130f4245af99e140cc5dc93bdf8e59ffd9cc9d6f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:58 -0800 Subject: perf jevents: Add mem_bw metric for Intel Break down memory bandwidth using uncore counters. For many models this matches the memory_bandwidth_* metrics, but these metrics aren't made available on all models. Add support for free running counters. Query the event JSON when determining which what events/counters are available. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 62 ++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index dddeae35e4b4..f671d6e4fd67 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -815,6 +815,67 @@ def IntelLdSt() -> Optional[MetricGroup]: ], description="Breakdown of load/store instructions") +def UncoreMemBw() -> Optional[MetricGroup]: + mem_events = [] + try: + mem_events = json.load(open(f"{os.path.dirname(os.path.realpath(__file__))}" + f"/arch/x86/{args.model}/uncore-memory.json")) + except: + pass + + ddr_rds = 0 + ddr_wrs = 0 + ddr_total = 0 + for x in mem_events: + if "EventName" in x: + name = x["EventName"] + if re.search("^UNC_MC[0-9]+_RDCAS_COUNT_FREERUN", name): + ddr_rds += Event(name) + elif re.search("^UNC_MC[0-9]+_WRCAS_COUNT_FREERUN", name): + ddr_wrs += Event(name) + # elif re.search("^UNC_MC[0-9]+_TOTAL_REQCOUNT_FREERUN", name): + # ddr_total += Event(name) + + if ddr_rds == 0: + try: + ddr_rds = Event("UNC_M_CAS_COUNT.RD") + ddr_wrs = Event("UNC_M_CAS_COUNT.WR") + except: + return None + + ddr_total = ddr_rds + ddr_wrs + + pmm_rds = 0 + pmm_wrs = 0 + try: + pmm_rds = Event("UNC_M_PMM_RPQ_INSERTS") + pmm_wrs = Event("UNC_M_PMM_WPQ_INSERTS") + except: + pass + + pmm_total = pmm_rds + pmm_wrs + + scale = 64 / 1_000_000 + return MetricGroup("lpm_mem_bw", [ + MetricGroup("lpm_mem_bw_ddr", [ + Metric("lpm_mem_bw_ddr_read", "DDR memory read bandwidth", + d_ratio(ddr_rds, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_bw_ddr_write", "DDR memory write bandwidth", + d_ratio(ddr_wrs, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_bw_ddr_total", "DDR memory write bandwidth", + d_ratio(ddr_total, interval_sec), f"{scale}MB/s"), + ], description="DDR Memory Bandwidth"), + MetricGroup("lpm_mem_bw_pmm", [ + Metric("lpm_mem_bw_pmm_read", "PMM memory read bandwidth", + d_ratio(pmm_rds, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_bw_pmm_write", "PMM memory write bandwidth", + d_ratio(pmm_wrs, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_bw_pmm_total", "PMM memory write bandwidth", + d_ratio(pmm_total, interval_sec), f"{scale}MB/s"), + ], description="PMM Memory Bandwidth") if pmm_rds != 0 else None, + ], description="Memory Bandwidth") + + def main() -> None: global _args @@ -853,6 +914,7 @@ def main() -> None: IntelMlp(), IntelPorts(), IntelSwpf(), + UncoreMemBw(), ]) if _args.metricgroups: -- cgit v1.2.3 From cde9c1a5d92520a874b333591ac29aef37b0c0cb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:44:59 -0800 Subject: perf jevents: Add local/remote "mem" breakdown metrics for Intel Breakdown local and remote memory bandwidth, read and writes. The implementation uses the HA and CHA PMUs present in server models broadwellde, broadwellx cascadelakex, emeraldrapids, haswellx, icelakex, ivytown, sapphirerapids and skylakex. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index f671d6e4fd67..983e5021f3d3 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -815,6 +815,36 @@ def IntelLdSt() -> Optional[MetricGroup]: ], description="Breakdown of load/store instructions") +def UncoreMem() -> Optional[MetricGroup]: + try: + loc_rds = Event("UNC_CHA_REQUESTS.READS_LOCAL", + "UNC_H_REQUESTS.READS_LOCAL") + rem_rds = Event("UNC_CHA_REQUESTS.READS_REMOTE", + "UNC_H_REQUESTS.READS_REMOTE") + loc_wrs = Event("UNC_CHA_REQUESTS.WRITES_LOCAL", + "UNC_H_REQUESTS.WRITES_LOCAL") + rem_wrs = Event("UNC_CHA_REQUESTS.WRITES_REMOTE", + "UNC_H_REQUESTS.WRITES_REMOTE") + except: + return None + + scale = 64 / 1_000_000 + return MetricGroup("lpm_mem", [ + MetricGroup("lpm_mem_local", [ + Metric("lpm_mem_local_read", "Local memory read bandwidth not including directory updates", + d_ratio(loc_rds, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_local_write", "Local memory write bandwidth not including directory updates", + d_ratio(loc_wrs, interval_sec), f"{scale}MB/s"), + ]), + MetricGroup("lpm_mem_remote", [ + Metric("lpm_mem_remote_read", "Remote memory read bandwidth not including directory updates", + d_ratio(rem_rds, interval_sec), f"{scale}MB/s"), + Metric("lpm_mem_remote_write", "Remote memory write bandwidth not including directory updates", + d_ratio(rem_wrs, interval_sec), f"{scale}MB/s"), + ]), + ], description="Memory Bandwidth breakdown local vs. remote (remote requests in). directory updates not included") + + def UncoreMemBw() -> Optional[MetricGroup]: mem_events = [] try: @@ -914,6 +944,7 @@ def main() -> None: IntelMlp(), IntelPorts(), IntelSwpf(), + UncoreMem(), UncoreMemBw(), ]) -- cgit v1.2.3 From 2166b44be938420934181be34fe20deee6e46441 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:00 -0800 Subject: perf jevents: Add dir breakdown metrics for Intel Breakdown directory hit, misses and requests. The implementation uses the M2M and CHA PMUs present in server models broadwellde, broadwellx cascadelakex, emeraldrapids, icelakex, sapphirerapids and skylakex. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 983e5021f3d3..24ceb7f8719b 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -815,6 +815,41 @@ def IntelLdSt() -> Optional[MetricGroup]: ], description="Breakdown of load/store instructions") +def UncoreDir() -> Optional[MetricGroup]: + try: + m2m_upd = Event("UNC_M2M_DIRECTORY_UPDATE.ANY") + m2m_hits = Event("UNC_M2M_DIRECTORY_HIT.DIRTY_I") + # Turn the umask into a ANY rather than DIRTY_I filter. + m2m_hits.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_HIT.ANY/" + m2m_miss = Event("UNC_M2M_DIRECTORY_MISS.DIRTY_I") + # Turn the umask into a ANY rather than DIRTY_I filter. + m2m_miss.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_MISS.ANY/" + cha_upd = Event("UNC_CHA_DIR_UPDATE.HA") + # Turn the umask into a ANY rather than HA filter. + cha_upd.name += "/umask=3,name=UNC_CHA_DIR_UPDATE.ANY/" + except: + return None + + m2m_total = m2m_hits + m2m_miss + upd = m2m_upd + cha_upd # in cache lines + upd_r = upd / interval_sec + look_r = m2m_total / interval_sec + + scale = 64 / 1_000_000 # Cache lines to MB + return MetricGroup("lpm_dir", [ + Metric("lpm_dir_lookup_rate", "", + d_ratio(m2m_total, interval_sec), "requests/s"), + Metric("lpm_dir_lookup_hits", "", + d_ratio(m2m_hits, m2m_total), "100%"), + Metric("lpm_dir_lookup_misses", "", + d_ratio(m2m_miss, m2m_total), "100%"), + Metric("lpm_dir_update_requests", "", + d_ratio(m2m_upd + cha_upd, interval_sec), "requests/s"), + Metric("lpm_dir_update_bw", "", + d_ratio(m2m_upd + cha_upd, interval_sec), f"{scale}MB/s"), + ]) + + def UncoreMem() -> Optional[MetricGroup]: try: loc_rds = Event("UNC_CHA_REQUESTS.READS_LOCAL", @@ -944,6 +979,7 @@ def main() -> None: IntelMlp(), IntelPorts(), IntelSwpf(), + UncoreDir(), UncoreMem(), UncoreMemBw(), ]) -- cgit v1.2.3 From 1fee2701a7d35fa7285479c5c2ee6c2d9bd99526 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:01 -0800 Subject: perf jevents: Add C-State metrics from the PCU PMU for Intel Use occupancy events fixed in: https://lore.kernel.org/lkml/20240226201517.3540187-1-irogers@google.com/ Metrics are at the socket level referring to cores, not hyperthreads. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 24ceb7f8719b..118fe0fc05a3 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -815,6 +815,35 @@ def IntelLdSt() -> Optional[MetricGroup]: ], description="Breakdown of load/store instructions") +def UncoreCState() -> Optional[MetricGroup]: + try: + pcu_ticks = Event("UNC_P_CLOCKTICKS") + c0 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C0") + c3 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C3") + c6 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C6") + except: + return None + + num_cores = Literal("#num_cores") / Literal("#num_packages") + + max_cycles = pcu_ticks * num_cores + total_cycles = c0 + c3 + c6 + + # remove fused-off cores which show up in C6/C7. + c6 = Select(max(c6 - (total_cycles - max_cycles), 0), + total_cycles > max_cycles, + c6) + + return MetricGroup("lpm_cstate", [ + Metric("lpm_cstate_c0", "C-State cores in C0/C1", + d_ratio(c0, pcu_ticks), "cores"), + Metric("lpm_cstate_c3", "C-State cores in C3", + d_ratio(c3, pcu_ticks), "cores"), + Metric("lpm_cstate_c6", "C-State cores in C6/C7", + d_ratio(c6, pcu_ticks), "cores"), + ]) + + def UncoreDir() -> Optional[MetricGroup]: try: m2m_upd = Event("UNC_M2M_DIRECTORY_UPDATE.ANY") @@ -979,6 +1008,7 @@ def main() -> None: IntelMlp(), IntelPorts(), IntelSwpf(), + UncoreCState(), UncoreDir(), UncoreMem(), UncoreMemBw(), -- cgit v1.2.3 From 6ec3058e709cc63513bafe105bf48b512baabe04 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:02 -0800 Subject: perf jevents: Add local/remote miss latency metrics for Intel Derive from CBOX/CHA occupancy and inserts the average latency as is provided in Intel's uncore performance monitoring reference. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 70 ++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 118fe0fc05a3..037f9b2ea1b6 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -6,9 +6,10 @@ import math import os import re from typing import Optional -from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric, - JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, - Metric, MetricConstraint, MetricGroup, MetricRef, Select) +from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event, + JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, + Literal, LoadEvents, Metric, MetricConstraint, MetricGroup, + MetricRef, Select) # Global command line arguments. _args = None @@ -624,6 +625,68 @@ def IntelL2() -> Optional[MetricGroup]: ], description="L2 data cache analysis") +def IntelMissLat() -> Optional[MetricGroup]: + try: + ticks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS") + data_rd_loc_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", + "UNC_CHA_TOR_OCCUPANCY.IA_MISS", + "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE", + "UNC_C_TOR_OCCUPANCY.MISS_OPCODE") + data_rd_loc_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL", + "UNC_CHA_TOR_INSERTS.IA_MISS", + "UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE", + "UNC_C_TOR_INSERTS.MISS_OPCODE") + data_rd_rem_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE", + "UNC_CHA_TOR_OCCUPANCY.IA_MISS", + "UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE", + "UNC_C_TOR_OCCUPANCY.NID_MISS_OPCODE") + data_rd_rem_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE", + "UNC_CHA_TOR_INSERTS.IA_MISS", + "UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE", + "UNC_C_TOR_INSERTS.NID_MISS_OPCODE") + except: + return None + + if (data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE" or + data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_OPCODE"): + data_rd = 0x182 + for e in [data_rd_loc_occ, data_rd_loc_ins, data_rd_rem_occ, data_rd_rem_ins]: + e.name += f"/filter_opc={hex(data_rd)}/" + elif data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS": + # Demand Data Read - Full cache-line read requests from core for + # lines to be cached in S or E, typically for data + demand_data_rd = 0x202 + # LLC Prefetch Data - Uncore will first look up the line in the + # LLC; for a cache hit, the LRU will be updated, on a miss, the + # DRd will be initiated + llc_prefetch_data = 0x25a + local_filter = (f"/filter_opc0={hex(demand_data_rd)}," + f"filter_opc1={hex(llc_prefetch_data)}," + "filter_loc,filter_nm,filter_not_nm/") + remote_filter = (f"/filter_opc0={hex(demand_data_rd)}," + f"filter_opc1={hex(llc_prefetch_data)}," + "filter_rem,filter_nm,filter_not_nm/") + for e in [data_rd_loc_occ, data_rd_loc_ins]: + e.name += local_filter + for e in [data_rd_rem_occ, data_rd_rem_ins]: + e.name += remote_filter + else: + assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ + + ticks_per_cha = ticks / source_count(data_rd_loc_ins) + loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \ + (ticks_per_cha * data_rd_loc_ins) + ticks_per_cha = ticks / source_count(data_rd_rem_ins) + rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \ + (ticks_per_cha * data_rd_rem_ins) + return MetricGroup("lpm_miss_lat", [ + Metric("lpm_miss_lat_loc", "Local to a socket miss latency in nanoseconds", + loc_lat, "ns"), + Metric("lpm_miss_lat_rem", "Remote to a socket miss latency in nanoseconds", + rem_lat, "ns"), + ]) + + def IntelMlp() -> Optional[Metric]: try: l1d = Event("L1D_PEND_MISS.PENDING") @@ -1005,6 +1068,7 @@ def main() -> None: IntelIlp(), IntelL2(), IntelLdSt(), + IntelMissLat(), IntelMlp(), IntelPorts(), IntelSwpf(), -- cgit v1.2.3 From 5dc81578ad77c298248a12de8b5e19923ef2c617 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:03 -0800 Subject: perf jevents: Add upi_bw metric for Intel Break down UPI read and write bandwidth using uncore_upi counters. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index 037f9b2ea1b6..f6bb691dc5bb 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -1033,6 +1033,27 @@ def UncoreMemBw() -> Optional[MetricGroup]: ], description="Memory Bandwidth") +def UncoreUpiBw() -> Optional[MetricGroup]: + try: + upi_rds = Event("UNC_UPI_RxL_FLITS.ALL_DATA") + upi_wrs = Event("UNC_UPI_TxL_FLITS.ALL_DATA") + except: + return None + + upi_total = upi_rds + upi_wrs + + # From "Uncore Performance Monitoring": When measuring the amount of + # bandwidth consumed by transmission of the data (i.e. NOT including + # the header), it should be .ALL_DATA / 9 * 64B. + scale = (64 / 9) / 1_000_000 + return MetricGroup("lpm_upi_bw", [ + Metric("lpm_upi_bw_read", "UPI read bandwidth", + d_ratio(upi_rds, interval_sec), f"{scale}MB/s"), + Metric("lpm_upi_bw_write", "DDR memory write bandwidth", + d_ratio(upi_wrs, interval_sec), f"{scale}MB/s"), + ], description="UPI Bandwidth") + + def main() -> None: global _args @@ -1076,6 +1097,7 @@ def main() -> None: UncoreDir(), UncoreMem(), UncoreMemBw(), + UncoreUpiBw(), ]) if _args.metricgroups: -- cgit v1.2.3 From e74f72a7e21782332bb7b9541634199278f3461b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:04 -0800 Subject: perf jevents: Add mesh bandwidth saturation metric for Intel Memory bandwidth saturation from CBOX/CHA events present in broadwellde, broadwellx, cascadelakex, haswellx, icelakex, skylakex and snowridgex. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/intel_metrics.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index f6bb691dc5bb..d56bab7337df 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -1033,6 +1033,22 @@ def UncoreMemBw() -> Optional[MetricGroup]: ], description="Memory Bandwidth") +def UncoreMemSat() -> Optional[Metric]: + try: + clocks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS") + sat = Event("UNC_CHA_DISTRESS_ASSERTED.VERT", "UNC_CHA_FAST_ASSERTED.VERT", + "UNC_C_FAST_ASSERTED") + except: + return None + + desc = ("Mesh Bandwidth saturation (% CBOX cycles with FAST signal asserted, " + "include QPI bandwidth saturation), lower is better") + if "UNC_CHA_" in sat.name: + desc = ("Mesh Bandwidth saturation (% CHA cycles with FAST signal asserted, " + "include UPI bandwidth saturation), lower is better") + return Metric("lpm_mem_sat", desc, d_ratio(sat, clocks), "100%") + + def UncoreUpiBw() -> Optional[MetricGroup]: try: upi_rds = Event("UNC_UPI_RxL_FLITS.ALL_DATA") @@ -1097,6 +1113,7 @@ def main() -> None: UncoreDir(), UncoreMem(), UncoreMemBw(), + UncoreMemSat(), UncoreUpiBw(), ]) -- cgit v1.2.3 From 82e53e7ae09a054b00cf3afdddf7c378351cf3e0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:05 -0800 Subject: perf jevents: Add cycles breakdown metric for arm64/AMD/Intel Breakdown cycles to user, kernel and guest. Add a common_metrics.py file for such metrics. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/Build | 2 +- tools/perf/pmu-events/amd_metrics.py | 2 ++ tools/perf/pmu-events/arm64_metrics.py | 5 ++++- tools/perf/pmu-events/common_metrics.py | 19 +++++++++++++++++++ tools/perf/pmu-events/intel_metrics.py | 2 ++ 5 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 tools/perf/pmu-events/common_metrics.py (limited to 'tools') diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index 68227614d0b1..ec964ed05974 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -52,7 +52,7 @@ $(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@ -GEN_METRIC_DEPS := pmu-events/metric.py +GEN_METRIC_DEPS := pmu-events/metric.py pmu-events/common_metrics.py # Generate AMD Json ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*) diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py index 83e77ccc059e..e2defaffde3e 100755 --- a/tools/perf/pmu-events/amd_metrics.py +++ b/tools/perf/pmu-events/amd_metrics.py @@ -4,6 +4,7 @@ import argparse import math import os from typing import Optional +from common_metrics import Cycles from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, Metric, MetricGroup, Select) @@ -475,6 +476,7 @@ def main() -> None: AmdItlb(), AmdLdSt(), AmdUpc(), + Cycles(), Idle(), Rapl(), UncoreL3(), diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py index ac717ca3513a..4ecda96d11fa 100755 --- a/tools/perf/pmu-events/arm64_metrics.py +++ b/tools/perf/pmu-events/arm64_metrics.py @@ -4,6 +4,7 @@ import argparse import os from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents, MetricGroup) +from common_metrics import Cycles # Global command line arguments. _args = None @@ -34,7 +35,9 @@ def main() -> None: directory = f"{_args.events_path}/arm64/{_args.vendor}/{_args.model}/" LoadEvents(directory) - all_metrics = MetricGroup("", []) + all_metrics = MetricGroup("", [ + Cycles(), + ]) if _args.metricgroups: print(JsonEncodeMetricGroupDescriptions(all_metrics)) diff --git a/tools/perf/pmu-events/common_metrics.py b/tools/perf/pmu-events/common_metrics.py new file mode 100644 index 000000000000..fcdfb9d3e648 --- /dev/null +++ b/tools/perf/pmu-events/common_metrics.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +from metric import (d_ratio, Event, Metric, MetricGroup) + + +def Cycles() -> MetricGroup: + cyc_k = Event("cpu\\-cycles:kHh") # exclude user and guest + cyc_g = Event("cpu\\-cycles:G") # exclude host + cyc_u = Event("cpu\\-cycles:uH") # exclude kernel, hypervisor and guest + cyc = cyc_k + cyc_g + cyc_u + + return MetricGroup("lpm_cycles", [ + Metric("lpm_cycles_total", "Total number of cycles", cyc, "cycles"), + Metric("lpm_cycles_user", "User cycles as a percentage of all cycles", + d_ratio(cyc_u, cyc), "100%"), + Metric("lpm_cycles_kernel", "Kernel cycles as a percentage of all cycles", + d_ratio(cyc_k, cyc), "100%"), + Metric("lpm_cycles_guest", "Hypervisor guest cycles as a percentage of all cycles", + d_ratio(cyc_g, cyc), "100%"), + ], description="cycles breakdown per privilege level (users, kernel, guest)") diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py index d56bab7337df..52035433b505 100755 --- a/tools/perf/pmu-events/intel_metrics.py +++ b/tools/perf/pmu-events/intel_metrics.py @@ -6,6 +6,7 @@ import math import os import re from typing import Optional +from common_metrics import Cycles from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, Literal, LoadEvents, Metric, MetricConstraint, MetricGroup, @@ -1095,6 +1096,7 @@ def main() -> None: LoadEvents(directory) all_metrics = MetricGroup("", [ + Cycles(), Idle(), Rapl(), Smi(), -- cgit v1.2.3 From e205952db7717557f71f22baa96589f0a56d83c5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Jan 2026 10:45:06 -0800 Subject: perf jevents: Validate that all names given an Event Validate they exist in a JSON file from one directory found from one directory above the model's JSON directory. This avoids broken fallback encodings being created. Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/metric.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py index 2029b6e28365..585454828c2f 100644 --- a/tools/perf/pmu-events/metric.py +++ b/tools/perf/pmu-events/metric.py @@ -11,12 +11,14 @@ from typing import Dict, List, Optional, Set, Tuple, Union all_pmus = set() all_events = set() experimental_events = set() +all_events_all_models = set() def LoadEvents(directory: str) -> None: """Populate a global set of all known events for the purpose of validating Event names""" global all_pmus global all_events global experimental_events + global all_events_all_models all_events = { "context\\-switches", "cpu\\-cycles", @@ -42,6 +44,20 @@ def LoadEvents(directory: str) -> None: # The generated directory may be the same as the input, which # causes partial json files. Ignore errors. pass + all_events_all_models = all_events.copy() + for root, dirs, files in os.walk(directory + ".."): + for filename in files: + if filename.endswith(".json"): + try: + for x in json.load(open(f"{root}/{filename}")): + if "EventName" in x: + all_events_all_models.add(x["EventName"]) + elif "ArchStdEvent" in x: + all_events_all_models.add(x["ArchStdEvent"]) + except json.decoder.JSONDecodeError: + # The generated directory may be the same as the input, which + # causes partial json files. Ignore errors. + pass def CheckPmu(name: str) -> bool: @@ -64,6 +80,25 @@ def CheckEvent(name: str) -> bool: return name in all_events +def CheckEveryEvent(*names: str) -> None: + """Check all the events exist in at least one json file""" + global all_events_all_models + if len(all_events_all_models) == 0: + assert len(names) == 1, f"Cannot determine valid events in {names}" + # No events loaded so assume any event is good. + return + + for name in names: + # Remove trailing modifier. + if ':' in name: + name = name[:name.find(':')] + elif '/' in name: + name = name[:name.find('/')] + if any([name.startswith(x) for x in ['amd', 'arm', 'cpu', 'msr', 'power']]): + continue + if name not in all_events_all_models: + raise Exception(f"Is {name} a named json event?") + def IsExperimentalEvent(name: str) -> bool: global experimental_events @@ -403,6 +438,7 @@ class Event(Expression): def __init__(self, *args: str): error = "" + CheckEveryEvent(*args) for name in args: if CheckEvent(name): self.name = _FixEscapes(name) -- cgit v1.2.3 From b640d556a2b354863a9962747a01f67f31cbf4d8 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 28 Jan 2026 19:05:51 +0000 Subject: selftests/bpf: Remove xxd util dependency The verification signature header generation requires converting a binary certificate to a C array. Previously this only worked with xxd (part of vim-common package). As xxd may not be available on some systems building selftests, it makes sense to substitute it with more common utils: hexdump, wc, sed to generate equivalent C array output. Tested by generating header with both xxd and hexdump and comparing them. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20260128190552.242335-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index b8bf51b7a0b0..a3ea98211ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,7 +23,6 @@ test_tcpnotify_user test_libbpf xdping test_cpp -test_progs_verification_cert *.d *.subskel.h *.skel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2c2f68a171ed..c6bf4dfb1495 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -720,9 +720,12 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)mkdir -p $(BUILD_DIR) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) +# Generates a header with C array declaration, containing test_progs_verification_cert bytes $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)ln -fs $< test_progs_verification_cert && \ - xxd -i test_progs_verification_cert > $@ + $(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \ + hexdump -v -e '12/1 " 0x%02x," "\n"' $< | sed 's/0x ,//g; $$s/,$$//'; \ + echo "};"; \ + echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -898,8 +901,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests \ - test_progs_verification_cert + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean -- cgit v1.2.3 From 08a7491843224f8b96518fbe70d9e48163046054 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Jan 2026 13:12:55 -0800 Subject: bpftool: Fix dependencies for static build When building selftests/bpf with EXTRA_LDFLAGS=-static the follwoing error happens: LINK /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-dso_dlfcn.o): in function `dlfcn_globallookup': [...] /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_expand_block': (.text+0xc64): undefined reference to `uncompress' /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_compress_block': (.text+0xce4): undefined reference to `compress' collect2: error: ld returned 1 exit status make[1]: *** [Makefile:252: /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool] Error 1 make: *** [Makefile:327: /ws/linux/tools/testing/selftests/bpf/tools/sbin/bpftool] Error 2 make: *** Waiting for unfinished jobs.... This is caused by wrong order of dependencies in the Makefile. Fix it. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260128211255.376933-1-ihor.solodrai@linux.dev --- tools/bpf/bpftool/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 5442073a2e42..519ea5cb8ab1 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -lcrypto -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto +LIBS = $(LIBBPF) -lelf -lcrypto -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd -- cgit v1.2.3 From 60d2c438c1bb705cdbf74ce8f12e6e141a4719b0 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 27 Jan 2026 12:59:12 +0100 Subject: bpf: Test nospec after dead stack write in helper Without the fix from the previous commit, the selftest fails: $ ./tools/testing/selftests/bpf/vmtest.sh -- \ ./test_progs -t verifier_unpriv [...] run_subtest:PASS:obj_open_mem 0 nsec libbpf: BTF loading error: -EPERM libbpf: Error loading .BTF into kernel: -EPERM. BTF is optional, ignoring. libbpf: prog 'unpriv_nospec_after_helper_stack_write': BPF program load failed: -EFAULT libbpf: prog 'unpriv_nospec_after_helper_stack_write': failed to load: -EFAULT libbpf: failed to load object 'verifier_unpriv' run_subtest:FAIL:unexpected_load_failure unexpected error: -14 (errno 14) VERIFIER LOG: ============= 0: R1=ctx() R10=fp0 0: (b7) r0 = 0 ; R0=P0 1: (55) if r0 != 0x1 goto pc+6 2: R0=Pscalar() R1=ctx() R10=fp0 2: (b7) r2 = 0 ; R2=P0 3: (bf) r3 = r10 ; R3=fp0 R10=fp0 4: (07) r3 += -16 ; R3=fp-16 5: (b7) r4 = 4 ; R4=P4 6: (b7) r5 = 0 ; R5=P0 7: (85) call bpf_skb_load_bytes_relative#68 verifier bug: speculation barrier after jump instruction may not have the desired effect (BPF_CLASS(insn->code) == BPF_JMP || BPF_CLASS(insn->code) == BPF_JMP32) processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 ============= [...] The test is based on the PoC from the report. Signed-off-by: Luis Gerhorst Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reported-by: Dongliang Mu Link: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/ Link: https://lore.kernel.org/r/20260127115912.3026761-3-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_unpriv.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index 28b4f7035ceb..8ee1243e62a8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -950,4 +950,26 @@ l3_%=: r0 = 0; \ " ::: __clobber_all); } +SEC("socket") +__description("unpriv: nospec after dead stack write in helper") +__success __success_unpriv +__retval(0) +/* Dead code sanitizer rewrites the call to `goto -1`. */ +__naked void unpriv_dead_helper_stack_write_nospec_result(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 != 1 goto l0_%=; \ + r2 = 0; \ + r3 = r10; \ + r3 += -16; \ + r4 = 4; \ + r5 = 0; \ + call %[bpf_skb_load_bytes_relative]; \ +l0_%=: exit; \ +" : + : __imm(bpf_skb_load_bytes_relative) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6080d525aba8a6cab9fe4b841ca1fab48a2969c6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jan 2026 12:38:28 +0000 Subject: selftest: packetdrill: add tcp_timestamping_tcp_tx_timestamp_bug.pkt Test tcp_tx_timestamp() behavior after ("tcp: tcp_tx_timestamp() must look at the rtx queue"). Without the fix, this new test fails like this: tcp_timestamping_tcp_tx_timestamp_bug.pkt:55: runtime error in recvmsg call: Expected result 0 but got -1 with errno 11 (Resource temporarily unavailable) Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260127123828.4098577-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../tcp_timestamping_tcp_tx_timestamp_bug.pkt | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt new file mode 100644 index 000000000000..95a1957a2cf9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test after "tcp: tcp_tx_timestamp() must look at the rtx queue" + +// This test is about receiving the SCM_TSTAMP_ACK, +// we do not care about its SCM_TIMESTAMPING precision. +--tolerance_usecs=1000000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_min_tso_segs=70 +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) ++.010 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [30000], 4) = 0 + + +0 write(3, ..., 9880) = 9880 + +0 > P. 1:9881(9880) ack 1 ++.010 < . 1:1(0) ack 9881 win 10000 + + +0 write(3, ..., 19760) = 19760 + +0 > P. 9881:29641(19760) ack 1 ++.010 < . 1:1(0) ack 29641 win 10000 + + +0 write(3, ..., 39520) = 39520 + +0 > P. 29641:69161(39520) ack 1 ++.010 < . 1:1(0) ack 69161 win 10000 + +// One more write to increase cwnd + +0 write(3, ..., 79040) = 79040 + +0 > P. 69161:108681(39520) ack 1 + +0 > P. 108681:148201(39520) ack 1 ++.010 < . 1:1(0) ack 148201 win 1000 + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have one write filling one skb +// last byte can not be stored because of our small SO_SNDBUF + +0 write(3, ..., 65209) = 65208 + +0 > P. 148201:213409(65208) ack 1 ++.010 < . 1:1(0) ack 213409 win 1000 + +// SCM_TSTAMP_ACK should be received after the last ack at +// t=60ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=60000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=65207}} + ]}, MSG_ERRQUEUE) = 0 -- cgit v1.2.3 From 70de46740b62b83198802bfe6682c5f865c25dc5 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 27 Jan 2026 08:30:55 -0800 Subject: selftests: drv-net: psp: fix test flakes from racy connection close There is a bug in assoc_sk_only_mismatch() and assoc_sk_only_mismatch_tx() that creates a race condition which triggers test flakes in later test cases e.g. data_send_bad_key(). The problem is that the client uses the "conn clr" rpc to setup a data connection with psp_responder, but never uses a matching "data close" rpc. This creates a race condition where if the client can queue another data sock request, like in data_send_bad_key(), before the server can accept the old connection from the backlog we end up in a situation where we have two connections in the backlog: one for the closed connection we have received a FIN for, and one for the new PSP connection which is expecting to do key exchange. From there the server pops the closed connection from the backlog, but the data_send_bad_key() test case in psp.py hangs waiting to perform key exchange. The fix is to properly use _conn_close, which fill force the server to remove the closed connection from the backlog before sending the RPC ack to the client. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20260127-psp-flaky-test-v1-1-13403e390af3@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/psp.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 528a421ecf76..864d9fce1094 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -266,6 +266,7 @@ def assoc_sk_only_mismatch(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_mismatch_tx(cfg): @@ -283,6 +284,7 @@ def assoc_sk_only_mismatch_tx(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_unconn(cfg): -- cgit v1.2.3 From 5fc90003de59b0f772a1654f5609fa5f87b4300f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jan 2026 17:48:06 +0000 Subject: selftests: drv-net: toeplitz: accept bigger rss keys /proc/sys/net/core/netdev_rss_key got bigger (256 bytes instead of 52) Fixes: 37b0ea8fef56 ("net: expand NETDEV_RSS_KEY_LEN to 256 bytes") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260127174806.886561-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index 285bb17df9c2..cd4bf58a44ee 100644 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -59,7 +59,7 @@ #include "../../../net/lib/ksft.h" #define TOEPLITZ_KEY_MIN_LEN 40 -#define TOEPLITZ_KEY_MAX_LEN 60 +#define TOEPLITZ_KEY_MAX_LEN 256 #define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ #define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) -- cgit v1.2.3 From 8467458dfa61b37e259e3485a5d3e415d08193c1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:24 +0100 Subject: selftests: mptcp: check no dup close events after error This validates the previous commit: subflow closed events are re-sent with less info when the initial subflow is disconnected after an error and each time a subflow is closed after that. In this new test, the userspace PM is involved because that's how it was discovered, but it is not specific to it. The initial subflow is terminated with a RESET, and that will cause the subflow disconnect. Then, a new subflow is initiated, but also got rejected, which cause a second subflow closed event, but not a third one. While at it, in case of failure to get the expected amount of events, the events are printed. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: d82809b6c5f2 ("mptcp: avoid duplicated SUB_CLOSED events") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-2-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b2e6e548f796..1765714a1e2f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3872,11 +3872,32 @@ chk_evt_nr() count=$(grep -cw "type:${evt}" "${evts}") if [ "${count}" != "${exp}" ]; then fail_test "got ${count} events, expected ${exp}" + cat "${evts}" else print_ok fi } +# $1: ns ; $2: event type ; $3: expected count +wait_event() +{ + local ns="${1}" + local evt_name="${2}" + local exp="${3}" + + local evt="${!evt_name}" + local evts="${evts_ns1}" + local count + + [ "${ns}" == "ns2" ] && evts="${evts_ns2}" + + for _ in $(seq 100); do + count=$(grep -cw "type:${evt}" "${evts}") + [ "${count}" -ge "${exp}" ] && break + sleep 0.1 + done +} + userspace_tests() { # userspace pm type prevents add_addr @@ -4085,6 +4106,36 @@ userspace_tests() kill_events_pids mptcp_lib_kill_group_wait $tests_pid fi + + # userspace pm no duplicated spurious close events after an error + if reset_with_events "userspace pm no dup close events after error" && + continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then + set_userspace_pm $ns2 + pm_nl_set_limits $ns1 0 2 + { timeout_test=120 test_linkfail=128 speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null + local tests_pid=$! + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + userspace_pm_add_sf $ns2 10.0.3.2 20 + chk_mptcp_info subflows 1 subflows 1 + chk_subflows_total 2 2 + + # force quick loss + ip netns exec $ns2 sysctl -q net.ipv4.tcp_syn_retries=1 + if ip netns exec "${ns1}" ${iptables} -A INPUT -s "10.0.1.2" \ + -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset && + ip netns exec "${ns2}" ${iptables} -A INPUT -d "10.0.1.2" \ + -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset; then + wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 1 + wait_event ns1 MPTCP_LIB_EVENT_SUB_CLOSED 1 + chk_subflows_total 1 1 + userspace_pm_add_sf $ns2 10.0.1.2 0 + wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + fi + kill_events_pids + mptcp_lib_kill_group_wait $tests_pid + fi } endpoint_tests() -- cgit v1.2.3 From 2ef9e3a3845d0a20b62b01f5b731debd0364688d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:26 +0100 Subject: selftests: mptcp: check subflow errors in close events This validates the previous commit: subflow closed events should contain an error field when a subflow got closed with an error, e.g. reset or timeout. For this test, the chk_evt_nr helper has been extended to check attributes in the matched events. In this test, the 2 subflow closed events should have an error. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-4-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 1765714a1e2f..3fc29201362a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3847,21 +3847,28 @@ userspace_pm_chk_get_addr() fi } -# $1: ns ; $2: event type ; $3: count +# $1: ns ; $2: event type ; $3: count ; [ $4: attr ; $5: attr count ] chk_evt_nr() { local ns=${1} local evt_name="${2}" local exp="${3}" + local attr="${4}" + local attr_exp="${5}" local evts="${evts_ns1}" local evt="${!evt_name}" + local attr_name local count + if [ -n "${attr}" ]; then + attr_name=", ${attr}: ${attr_exp}" + fi + evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_ [ "${ns}" == "ns2" ] && evts="${evts_ns2}" - print_check "event ${ns} ${evt_name} (${exp})" + print_check "event ${ns} ${evt_name} (${exp}${attr_name})" if [[ "${evt_name}" = "LISTENER_"* ]] && ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then @@ -3873,6 +3880,16 @@ chk_evt_nr() if [ "${count}" != "${exp}" ]; then fail_test "got ${count} events, expected ${exp}" cat "${evts}" + return + elif [ -z "${attr}" ]; then + print_ok + return + fi + + count=$(grep -w "type:${evt}" "${evts}" | grep -c ",${attr}:") + if [ "${count}" != "${attr_exp}" ]; then + fail_test "got ${count} event attributes, expected ${attr_exp}" + grep -w "type:${evt}" "${evts}" else print_ok fi @@ -4131,7 +4148,7 @@ userspace_tests() chk_subflows_total 1 1 userspace_pm_add_sf $ns2 10.0.1.2 0 wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 error 2 fi kill_events_pids mptcp_lib_kill_group_wait $tests_pid -- cgit v1.2.3 From c5d5ecf21fdd9ce91e6116feb3aa83cee73352cc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:27 +0100 Subject: selftests: mptcp: join: fix local endp not being tracked When running this mptcp_join.sh selftest on older kernel versions not supporting local endpoints tracking, this test fails because 3 MP_JOIN ACKs have been received, while only 2 were expected. It is not clear why only 2 MP_JOIN ACKs were expected on old kernel versions, while 3 MP_JOIN SYN and SYN+ACK were expected. When testing on the v5.15.197 kernel, 3 MP_JOIN ACKs are seen, which is also what is expected in the selftests included in this kernel version, see commit f4480eaad489 ("selftests: mptcp: add missing join check"). Switch the expected MP_JOIN ACKs to 3. While at it, move this chk_join_nr helper out of the special condition for older kernel versions as it is now the same as with more recent ones. Also, invert the condition to be more logical: what's expected on newer kernel versions having such helper first. Fixes: d4c81bbb8600 ("selftests: mptcp: join: support local endpoint being tracked or not") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-5-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 3fc29201362a..e70d3420954f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2329,17 +2329,16 @@ signal_address_tests() ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 3 3 3 # It is not directly linked to the commit introducing this # symbol but for the parent one which is linked anyway. - if ! mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then - chk_join_nr 3 3 2 - chk_add_nr 4 4 - else - chk_join_nr 3 3 3 + if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then # the server will not signal the address terminating # the MPC subflow chk_add_nr 3 3 + else + chk_add_nr 4 4 fi fi } -- cgit v1.2.3 From 5e51803521938566bdf099501379a9bdbacb6066 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 22 Jan 2026 18:46:17 +0100 Subject: selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Similar to IPIP, introduce specific selftest for IP6IP6 flowtable SW acceleration in nft_flowtable.sh Signed-off-by: Lorenzo Bianconi Signed-off-by: Florian Westphal --- .../selftests/net/netfilter/nft_flowtable.sh | 62 ++++++++++++++++++---- 1 file changed, 53 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a68bc882fa4e..14d7f67715ed 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -592,16 +592,28 @@ ip -net "$nsr1" link set tun0 up ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 +ip -net "$nsr1" link set tun6 up +ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad + ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 ip -net "$nsr2" link set tun0 up ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 +ip -net "$nsr2" link set tun6 up +ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad + ip -net "$nsr1" route change default via 192.168.100.2 ip -net "$nsr2" route change default via 192.168.100.1 +ip -6 -net "$nsr1" route change default via fee1:3::2 +ip -6 -net "$nsr2" route change default via fee1:3::1 ip -net "$ns2" route add default via 10.0.2.1 +ip -6 -net "$ns2" route add default via dead:2::1 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept' ip netns exec "$nsr1" nft -a insert rule inet filter forward \ 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' @@ -611,28 +623,51 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Create vlan tagged devices for IPIP traffic. ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 ip -net "$nsr1" link set veth1.10 up ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' -ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 -ip -net "$nsr1" link set tun1 up -ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 + +ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun0.10 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10 ip -net "$nsr1" route change default via 192.168.200.2 -ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null -ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept' + +ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 +ip -net "$nsr1" link set tun6.10 up +ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad +ip -6 -net "$nsr1" route change default via fee1:5::2 +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept' ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 ip -net "$nsr2" link set veth0.10 up ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null -ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 -ip -net "$nsr2" link set tun1 up -ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 + +ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun0.10 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10 ip -net "$nsr2" route change default via 192.168.200.1 -ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 +ip -net "$nsr2" link set tun6.10 up +ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad +ip -6 -net "$nsr2" route change default via fee1:5::1 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 @@ -640,10 +675,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Restore the previous configuration ip -net "$nsr1" route change default via 192.168.10.2 ip -net "$nsr2" route change default via 192.168.10.1 ip -net "$ns2" route del default via 10.0.2.1 +ip -6 -net "$ns2" route del default via dead:2::1 } # Another test: -- cgit v1.2.3 From 462a94fb8ae8ba0d4d3901c7283b4af052ab8804 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 25 Jan 2026 21:09:55 -0700 Subject: riscv: hwprobe: add support for RISCV_HWPROBE_KEY_IMA_EXT_1 We've run out of bits to describe RISC-V ISA extensions in our initial hwprobe key, RISCV_HWPROBE_KEY_IMA_EXT_0. So, let's add RISCV_HWPROBE_KEY_IMA_EXT_1, along with the framework to set the appropriate hwprobe tuple, and add testing for it. Based on a suggestion from Andrew Jones , also fix the documentation for RISCV_HWPROBE_KEY_IMA_EXT_0. Reviewed-by: Andrew Jones Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/hwprobe.rst | 6 +- arch/riscv/include/asm/hwprobe.h | 3 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 169 ++++++++++++--------- tools/testing/selftests/riscv/hwprobe/which-cpus.c | 18 ++- 5 files changed, 121 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 641ec4abb906..c420a8349bc6 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -67,7 +67,7 @@ The following keys are defined: programs (it may still be executed in userspace via a kernel-controlled mechanism such as the vDSO). -* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing the extensions +* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing extensions that are compatible with the :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. @@ -387,3 +387,7 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE`: An unsigned int which represents the size of the Zicbop block in bytes. + +* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_1`: A bitmask containing additional + extensions that are compatible with the + :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 8c572a464719..8b9f5e1cf4cb 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 15 +#define RISCV_HWPROBE_MAX_KEY 16 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { @@ -20,6 +20,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) switch (key) { case RISCV_HWPROBE_KEY_BASE_BEHAVIOR: case RISCV_HWPROBE_KEY_IMA_EXT_0: + case RISCV_HWPROBE_KEY_IMA_EXT_1: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index cd3c126730c3..ed2621a5a47d 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -113,6 +113,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 #define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0 14 #define RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE 15 +#define RISCV_HWPROBE_KEY_IMA_EXT_1 16 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index e6787ba7f2fc..53731ace7984 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -24,6 +24,14 @@ #include +#define EXT_KEY(isa_arg, ext, pv, missing) \ + do { \ + if (__riscv_isa_extension_available(isa_arg, RISCV_ISA_EXT_##ext)) \ + pv |= RISCV_HWPROBE_EXT_##ext; \ + else \ + missing |= RISCV_HWPROBE_EXT_##ext; \ + } while (false) + static void hwprobe_arch_id(struct riscv_hwprobe *pair, const struct cpumask *cpus) { @@ -93,90 +101,109 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, for_each_cpu(cpu, cpus) { struct riscv_isainfo *isainfo = &hart_isa[cpu]; -#define EXT_KEY(ext) \ - do { \ - if (__riscv_isa_extension_available(isainfo->isa, RISCV_ISA_EXT_##ext)) \ - pair->value |= RISCV_HWPROBE_EXT_##ext; \ - else \ - missing |= RISCV_HWPROBE_EXT_##ext; \ - } while (false) - /* * Only use EXT_KEY() for extensions which can be exposed to userspace, * regardless of the kernel's configuration, as no other checks, besides * presence in the hart_isa bitmap, are made. */ - EXT_KEY(ZAAMO); - EXT_KEY(ZABHA); - EXT_KEY(ZACAS); - EXT_KEY(ZALASR); - EXT_KEY(ZALRSC); - EXT_KEY(ZAWRS); - EXT_KEY(ZBA); - EXT_KEY(ZBB); - EXT_KEY(ZBC); - EXT_KEY(ZBKB); - EXT_KEY(ZBKC); - EXT_KEY(ZBKX); - EXT_KEY(ZBS); - EXT_KEY(ZCA); - EXT_KEY(ZCB); - EXT_KEY(ZCLSD); - EXT_KEY(ZCMOP); - EXT_KEY(ZICBOM); - EXT_KEY(ZICBOP); - EXT_KEY(ZICBOZ); - EXT_KEY(ZICNTR); - EXT_KEY(ZICOND); - EXT_KEY(ZIHINTNTL); - EXT_KEY(ZIHINTPAUSE); - EXT_KEY(ZIHPM); - EXT_KEY(ZILSD); - EXT_KEY(ZIMOP); - EXT_KEY(ZKND); - EXT_KEY(ZKNE); - EXT_KEY(ZKNH); - EXT_KEY(ZKSED); - EXT_KEY(ZKSH); - EXT_KEY(ZKT); - EXT_KEY(ZTSO); + EXT_KEY(isainfo->isa, ZAAMO, pair->value, missing); + EXT_KEY(isainfo->isa, ZABHA, pair->value, missing); + EXT_KEY(isainfo->isa, ZACAS, pair->value, missing); + EXT_KEY(isainfo->isa, ZALASR, pair->value, missing); + EXT_KEY(isainfo->isa, ZALRSC, pair->value, missing); + EXT_KEY(isainfo->isa, ZAWRS, pair->value, missing); + EXT_KEY(isainfo->isa, ZBA, pair->value, missing); + EXT_KEY(isainfo->isa, ZBB, pair->value, missing); + EXT_KEY(isainfo->isa, ZBC, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKB, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKC, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKX, pair->value, missing); + EXT_KEY(isainfo->isa, ZBS, pair->value, missing); + EXT_KEY(isainfo->isa, ZCA, pair->value, missing); + EXT_KEY(isainfo->isa, ZCB, pair->value, missing); + EXT_KEY(isainfo->isa, ZCLSD, pair->value, missing); + EXT_KEY(isainfo->isa, ZCMOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOM, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOZ, pair->value, missing); + EXT_KEY(isainfo->isa, ZICNTR, pair->value, missing); + EXT_KEY(isainfo->isa, ZICOND, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHINTNTL, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHINTPAUSE, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHPM, pair->value, missing); + EXT_KEY(isainfo->isa, ZILSD, pair->value, missing); + EXT_KEY(isainfo->isa, ZIMOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZKND, pair->value, missing); + EXT_KEY(isainfo->isa, ZKNE, pair->value, missing); + EXT_KEY(isainfo->isa, ZKNH, pair->value, missing); + EXT_KEY(isainfo->isa, ZKSED, pair->value, missing); + EXT_KEY(isainfo->isa, ZKSH, pair->value, missing); + EXT_KEY(isainfo->isa, ZKT, pair->value, missing); + EXT_KEY(isainfo->isa, ZTSO, pair->value, missing); /* * All the following extensions must depend on the kernel * support of V. */ if (has_vector()) { - EXT_KEY(ZVBB); - EXT_KEY(ZVBC); - EXT_KEY(ZVE32F); - EXT_KEY(ZVE32X); - EXT_KEY(ZVE64D); - EXT_KEY(ZVE64F); - EXT_KEY(ZVE64X); - EXT_KEY(ZVFBFMIN); - EXT_KEY(ZVFBFWMA); - EXT_KEY(ZVFH); - EXT_KEY(ZVFHMIN); - EXT_KEY(ZVKB); - EXT_KEY(ZVKG); - EXT_KEY(ZVKNED); - EXT_KEY(ZVKNHA); - EXT_KEY(ZVKNHB); - EXT_KEY(ZVKSED); - EXT_KEY(ZVKSH); - EXT_KEY(ZVKT); + EXT_KEY(isainfo->isa, ZVBB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVBC, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE32F, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE32X, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64D, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64F, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64X, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFBFMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFBFWMA, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFH, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFHMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKG, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNED, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNHA, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNHB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKSED, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKSH, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKT, pair->value, missing); } - EXT_KEY(ZCD); - EXT_KEY(ZCF); - EXT_KEY(ZFA); - EXT_KEY(ZFBFMIN); - EXT_KEY(ZFH); - EXT_KEY(ZFHMIN); + EXT_KEY(isainfo->isa, ZCD, pair->value, missing); + EXT_KEY(isainfo->isa, ZCF, pair->value, missing); + EXT_KEY(isainfo->isa, ZFA, pair->value, missing); + EXT_KEY(isainfo->isa, ZFBFMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZFH, pair->value, missing); + EXT_KEY(isainfo->isa, ZFHMIN, pair->value, missing); if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) - EXT_KEY(SUPM); -#undef EXT_KEY + EXT_KEY(isainfo->isa, SUPM, pair->value, missing); + } + + /* Now turn off reporting features if any CPU is missing it. */ + pair->value &= ~missing; +} + +static void hwprobe_isa_ext1(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + int cpu; + u64 missing = 0; + + pair->value = 0; + + /* + * Loop through and record extensions that 1) anyone has, and 2) anyone + * doesn't have. + */ + for_each_cpu(cpu, cpus) { + /* struct riscv_isainfo *isainfo = &hart_isa[cpu]; */ + + /* + * Only use EXT_KEY() for extensions which can be + * exposed to userspace, regardless of the kernel's + * configuration, as no other checks, besides presence + * in the hart_isa bitmap, are made. + */ + /* Nothing here yet */ } /* Now turn off reporting features if any CPU is missing it. */ @@ -287,6 +314,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, hwprobe_isa_ext0(pair, cpus); break; + case RISCV_HWPROBE_KEY_IMA_EXT_1: + hwprobe_isa_ext1(pair, cpus); + break; + case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: pair->value = hwprobe_misaligned(cpus); diff --git a/tools/testing/selftests/riscv/hwprobe/which-cpus.c b/tools/testing/selftests/riscv/hwprobe/which-cpus.c index 3ab53067e8dd..587feb198c04 100644 --- a/tools/testing/selftests/riscv/hwprobe/which-cpus.c +++ b/tools/testing/selftests/riscv/hwprobe/which-cpus.c @@ -83,9 +83,9 @@ static void do_which_cpus(int argc, char **argv, cpu_set_t *cpus) int main(int argc, char **argv) { - struct riscv_hwprobe pairs[2]; + struct riscv_hwprobe pairs[3]; cpu_set_t cpus_aff, cpus; - __u64 ext0_all; + __u64 ext0_all, ext1_all; long rc; rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus_aff); @@ -112,6 +112,11 @@ int main(int argc, char **argv) assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_0); ext0_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, }; + rc = riscv_hwprobe(pairs, 1, 0, NULL, 0); + assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_1); + ext1_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; CPU_ZERO(&cpus); rc = riscv_hwprobe(pairs, 1, 0, (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); @@ -134,20 +139,23 @@ int main(int argc, char **argv) pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; CPU_ZERO(&cpus); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == sysconf(_SC_NPROCESSORS_ONLN), "set all cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_EQUAL(&cpus, &cpus_aff), "set all affinity cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ~ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ~ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == 0, "clear all cpus\n"); ksft_finished(); -- cgit v1.2.3 From d30c1683aaecb93d2ab95685dc4300a33d3cea7a Mon Sep 17 00:00:00 2001 From: Deepak Gupta Date: Sun, 25 Jan 2026 21:09:56 -0700 Subject: kselftest/riscv: add kselftest for user mode CFI Add a kselftest for RISC-V control flow integrity implementation for user mode. There is not a lot going on in the kernel to enable landing pad for user mode. CFI selftests are intended to be compiled with a zicfilp and zicfiss enabled compiler. This kselftest simply checks if landing pads and shadow stacks for the process are enabled or not and executes ptrace selftests on CFI. The selftest then registers a SIGSEGV signal handler. Any control flow violations are reported as SIGSEGV with si_code = SEGV_CPERR. The test will fail on receiving any SEGV_CPERR. The shadow stack part has more changes in the kernel, and thus there are separate tests for that. - Exercise 'map_shadow_stack' syscall - 'fork' test to make sure COW works for shadow stack pages - gup tests Kernel uses FOLL_FORCE when access happens to memory via /proc//mem. Not breaking that for shadow stack. - signal test. Make sure signal delivery results in token creation on shadow stack and consumes (and verifies) token on sigreturn - shadow stack protection test. attempts to write using regular store instruction on shadow stack memory must result in access faults - ptrace test: adds landing pad violation, clears ELP and continues In case the toolchain doesn't support the CFI extension, the CFI kselftest won't be built. Test output =========== """ TAP version 13 1..5 This is to ensure shadow stack is indeed enabled and working This is to ensure shadow stack is indeed enabled and working ok 1 shstk fork test ok 2 map shadow stack syscall ok 3 shadow stack gup tests ok 4 shadow stack signal tests ok 5 memory protections of shadow stack memory """ Suggested-by: Charlie Jenkins Signed-off-by: Charlie Jenkins Signed-off-by: Deepak Gupta Tested-by: Andreas Korb # QEMU, custom CVA6 Tested-by: Valentin Haudiquet Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-28-b55691eacf4f@rivosinc.com [pjw@kernel.org: updated to apply; cleaned up patch description, code comments] Signed-off-by: Paul Walmsley --- tools/testing/selftests/riscv/Makefile | 2 +- tools/testing/selftests/riscv/cfi/.gitignore | 2 + tools/testing/selftests/riscv/cfi/Makefile | 23 ++ tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 +++++ tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++++++ tools/testing/selftests/riscv/cfi/shadowstack.c | 385 ++++++++++++++++++++++++ tools/testing/selftests/riscv/cfi/shadowstack.h | 27 ++ 7 files changed, 693 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/riscv/cfi/.gitignore create mode 100644 tools/testing/selftests/riscv/cfi/Makefile create mode 100644 tools/testing/selftests/riscv/cfi/cfi_rv_test.h create mode 100644 tools/testing/selftests/riscv/cfi/cfitests.c create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.c create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.h (limited to 'tools') diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 099b8c1f46f8..5671b4405a12 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector +RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector cfi else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/cfi/.gitignore b/tools/testing/selftests/riscv/cfi/.gitignore new file mode 100644 index 000000000000..c1faf7ca4346 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/.gitignore @@ -0,0 +1,2 @@ +cfitests +shadowstack diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile new file mode 100644 index 000000000000..96a4dc4b69c3 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/Makefile @@ -0,0 +1,23 @@ +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -I$(top_srcdir)/tools/include + +CFLAGS += -march=rv64gc_zicfilp_zicfiss -fcf-protection=full + +# Check for zicfi* extensions needs cross compiler +# which is not set until lib.mk is included +ifeq ($(LLVM)$(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + + +ifeq ($(shell $(CC) $(CFLAGS) -nostdlib -xc /dev/null -o /dev/null > /dev/null 2>&1; echo $$?),0) +TEST_GEN_PROGS := cfitests + +$(OUTPUT)/cfitests: cfitests.c shadowstack.c + $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ +else + +$(shell echo "Toolchain doesn't support CFI, skipping CFI kselftest." >&2) +endif + +include ../../lib.mk diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h new file mode 100644 index 000000000000..1c8043f2b778 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_RISCV_CFI_H +#define SELFTEST_RISCV_CFI_H +#include +#include +#include "shadowstack.h" + +#define CHILD_EXIT_CODE_SSWRITE 10 +#define CHILD_EXIT_CODE_SIG_TEST 11 + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" \ + (_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" (_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +#define CSR_SSP 0x011 + +#ifdef __ASSEMBLY__ +#define __ASM_STR(x) x +#else +#define __ASM_STR(x) #x +#endif + +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) \ + : "=r" (__v) : \ + : "memory"); \ + __v; \ +}) + +#define csr_write(csr, val) \ +({ \ + unsigned long __v = (unsigned long)(val); \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" \ + : : "rK" (__v) \ + : "memory"); \ +}) + +#endif diff --git a/tools/testing/selftests/riscv/cfi/cfitests.c b/tools/testing/selftests/riscv/cfi/cfitests.c new file mode 100644 index 000000000000..298544854415 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfitests.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cfi_rv_test.h" + +/* do not optimize cfi related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void sigsegv_handler(int signum, siginfo_t *si, void *uc) +{ + struct ucontext *ctx = (struct ucontext *)uc; + + if (si->si_code == SEGV_CPERR) { + ksft_print_msg("Control flow violation happened somewhere\n"); + ksft_print_msg("PC where violation happened %lx\n", ctx->uc_mcontext.gregs[0]); + exit(-1); + } + + /* all other cases are expected to be of shadow stack write case */ + exit(CHILD_EXIT_CODE_SSWRITE); +} + +bool register_signal_handler(void) +{ + struct sigaction sa = {}; + + sa.sa_sigaction = sigsegv_handler; + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGSEGV, &sa, NULL)) { + ksft_print_msg("Registering signal handler for landing pad violation failed\n"); + return false; + } + + return true; +} + +long ptrace(int request, pid_t pid, void *addr, void *data); + +bool cfi_ptrace_test(void) +{ + pid_t pid; + int status, ret = 0; + unsigned long ptrace_test_num = 0, total_ptrace_tests = 2; + + struct user_cfi_state cfi_reg; + struct iovec iov; + + pid = fork(); + + if (pid == -1) { + ksft_exit_fail_msg("%s: fork failed\n", __func__); + exit(1); + } + + if (pid == 0) { + /* allow to be traced */ + ptrace(PTRACE_TRACEME, 0, NULL, NULL); + raise(SIGSTOP); + asm volatile ("la a5, 1f\n" + "jalr a5\n" + "nop\n" + "nop\n" + "1: nop\n" + : : : "a5"); + exit(11); + /* child shouldn't go beyond here */ + } + + /* parent's code goes here */ + iov.iov_base = &cfi_reg; + iov.iov_len = sizeof(cfi_reg); + + while (ptrace_test_num < total_ptrace_tests) { + memset(&cfi_reg, 0, sizeof(cfi_reg)); + waitpid(pid, &status, 0); + if (WIFSTOPPED(status)) { + errno = 0; + ret = ptrace(PTRACE_GETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + } else { + ksft_exit_fail_msg("%s: child didn't stop, failed\n", __func__); + } + + switch (ptrace_test_num) { +#define CFI_ENABLE_MASK (PTRACE_CFI_LP_EN_STATE | \ + PTRACE_CFI_SS_EN_STATE | \ + PTRACE_CFI_SS_PTR_STATE) + case 0: + if ((cfi_reg.cfi_status.cfi_state & CFI_ENABLE_MASK) != CFI_ENABLE_MASK) + ksft_exit_fail_msg("%s: ptrace_getregset failed, %llu\n", __func__, + cfi_reg.cfi_status.cfi_state); + if (!cfi_reg.shstk_ptr) + ksft_exit_fail_msg("%s: NULL shadow stack pointer, test failed\n", + __func__); + break; + case 1: + if (!(cfi_reg.cfi_status.cfi_state & PTRACE_CFI_ELP_STATE)) + ksft_exit_fail_msg("%s: elp must have been set\n", __func__); + /* clear elp state. not interested in anything else */ + cfi_reg.cfi_status.cfi_state = 0; + + ret = ptrace(PTRACE_SETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + break; + default: + ksft_exit_fail_msg("%s: unreachable switch case\n", __func__); + break; + } + ptrace(PTRACE_CONT, pid, NULL, NULL); + ptrace_test_num++; + } + + waitpid(pid, &status, 0); + if (WEXITSTATUS(status) != 11) + ksft_print_msg("%s, bad return code from child\n", __func__); + + ksft_print_msg("%s, ptrace test succeeded\n", __func__); + return true; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + unsigned long lpad_status = 0, ss_status = 0; + + ksft_print_header(); + + ksft_print_msg("Starting risc-v tests\n"); + + /* + * Landing pad test. Not a lot of kernel changes to support landing + * pads for user mode except lighting up a bit in senvcfg via a prctl. + * Enable landing pad support throughout the execution of the test binary. + */ + ret = my_syscall5(__NR_prctl, PR_GET_INDIR_BR_LP_STATUS, &lpad_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get landing pad status failed with %d\n", ret); + + if (!(lpad_status & PR_INDIR_BR_LP_ENABLE)) + ksft_exit_fail_msg("Landing pad is not enabled, should be enabled via glibc\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get shadow stack failed with %d\n", ret); + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_fail_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + if (!register_signal_handler()) + ksft_exit_fail_msg("Registering signal handler for SIGSEGV failed\n"); + + ksft_print_msg("Landing pad and shadow stack are enabled for binary\n"); + cfi_ptrace_test(); + + execute_shadow_stack_tests(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.c b/tools/testing/selftests/riscv/cfi/shadowstack.c new file mode 100644 index 000000000000..f8eed8260a12 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include +#include +#include +#include +#include +#include "shadowstack.h" +#include "cfi_rv_test.h" + +static struct shadow_stack_tests shstk_tests[] = { + { "shstk fork test\n", shadow_stack_fork_test }, + { "map shadow stack syscall\n", shadow_stack_map_test }, + { "shadow stack gup tests\n", shadow_stack_gup_tests }, + { "shadow stack signal tests\n", shadow_stack_signal_test}, + { "memory protections of shadow stack memory\n", shadow_stack_protection_test } +}; + +#define RISCV_SHADOW_STACK_TESTS ARRAY_SIZE(shstk_tests) + +/* do not optimize shadow stack related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void zar(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar(void) +{ + zar(); +} + +void foo(void) +{ + bar(); +} + +void zar_child(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar_child(void) +{ + zar_child(); +} + +void foo_child(void) +{ + bar_child(); +} + +typedef void (call_func_ptr)(void); +/* + * call couple of functions to test push/pop. + */ +int shadow_stack_call_tests(call_func_ptr fn_ptr, bool parent) +{ + ksft_print_msg("dummy calls for sspush and sspopchk in context of %s\n", + parent ? "parent" : "child"); + + (fn_ptr)(); + + return 0; +} + +/* forks a thread, and ensure shadow stacks fork out */ +bool shadow_stack_fork_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, parent_pid = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack fork test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_exit_skip("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_skip("Shadow stack is not enabled, should be enabled via glibc\n"); + + parent_pid = getpid(); + pid = fork(); + + if (pid) { + ksft_print_msg("Parent pid %d and child pid %d\n", parent_pid, pid); + shadow_stack_call_tests(&foo, true); + } else { + shadow_stack_call_tests(&foo_child, false); + } + + if (pid) { + ksft_print_msg("Waiting on child to finish\n"); + wait(&child_status); + } else { + /* exit child gracefully */ + exit(0); + } + + if (pid && WIFSIGNALED(child_status)) { + ksft_print_msg("Child faulted, fork test failed\n"); + return false; + } + + return true; +} + +/* exercise 'map_shadow_stack', pivot to it and call some functions to ensure it works */ +#define SHADOW_STACK_ALLOC_SIZE 4096 +bool shadow_stack_map_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + int ret = 0; + + ksft_print_msg("Exercising shadow stack map test\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + ret = munmap((void *)shdw_addr, SHADOW_STACK_ALLOC_SIZE); + + if (ret) { + ksft_print_msg("munmap failed with error code %d\n", ret); + return false; + } + + return true; +} + +/* + * shadow stack protection tests. map a shadow stack and + * validate all memory protections work on it + */ +bool shadow_stack_protection_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + unsigned long *write_addr = NULL; + int ret = 0, pid = 0, child_status = 0; + + ksft_print_msg("Exercising shadow stack protection test (WPT)\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + pid = fork(); + + /* no child was created, return false */ + if (pid == -1) + return false; + + /* + * try to perform a store from child on shadow stack memory + * it should result in SIGSEGV + */ + if (!pid) { + /* below write must lead to SIGSEGV */ + *write_addr = 0xdeadbeef; + } else { + wait(&child_status); + } + + /* test fail, if 0xdeadbeef present on shadow stack address */ + if (*write_addr == 0xdeadbeef) { + ksft_print_msg("Shadow stack WPT failed\n"); + return false; + } + + /* if child reached here, then fail */ + if (!pid) { + ksft_print_msg("Shadow stack WPT failed: child reached unreachable state\n"); + return false; + } + + /* if child exited via signal handler but not for write on ss */ + if (WIFEXITED(child_status) && + WEXITSTATUS(child_status) != CHILD_EXIT_CODE_SSWRITE) { + ksft_print_msg("Shadow stack WPT failed: child wasn't signaled for write\n"); + return false; + } + + ret = munmap(write_addr, SHADOW_STACK_ALLOC_SIZE); + if (ret) { + ksft_print_msg("Shadow stack WPT failed: munmap failed, error code %d\n", + ret); + return false; + } + + return true; +} + +#define SS_MAGIC_WRITE_VAL 0xbeefdead + +int gup_tests(int mem_fd, unsigned long *shdw_addr) +{ + unsigned long val = 0; + + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (read(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Reading shadow stack mem via gup failed\n"); + return 1; + } + + val = SS_MAGIC_WRITE_VAL; + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (write(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Writing shadow stack mem via gup failed\n"); + return 1; + } + + if (*shdw_addr != SS_MAGIC_WRITE_VAL) { + ksft_print_msg("GUP write to shadow stack memory failed\n"); + return 1; + } + + return 0; +} + +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr = 0; + unsigned long *write_addr = NULL; + int fd = 0; + bool ret = false; + + ksft_print_msg("Exercising shadow stack gup tests\n"); + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + + fd = open("/proc/self/mem", O_RDWR); + if (fd == -1) + return false; + + if (gup_tests(fd, write_addr)) { + ksft_print_msg("gup tests failed\n"); + goto out; + } + + ret = true; +out: + if (shdw_addr && munmap(write_addr, SHADOW_STACK_ALLOC_SIZE)) { + ksft_print_msg("munmap failed with error code %d\n", ret); + ret = false; + } + + return ret; +} + +volatile bool break_loop; + +void sigusr1_handler(int signo) +{ + break_loop = true; +} + +bool sigusr1_signal_test(void) +{ + struct sigaction sa = {}; + + sa.sa_handler = sigusr1_handler; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) { + ksft_print_msg("Registering signal handler for SIGUSR1 failed\n"); + return false; + } + + return true; +} + +/* + * shadow stack signal test. shadow stack must be enabled. + * register a signal, fork another thread which is waiting + * on signal. Send a signal from parent to child, verify + * that signal was received by child. If not test fails + */ +bool shadow_stack_signal_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack signal test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_print_msg("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_print_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + /* this should be caught by signal handler and do an exit */ + if (!sigusr1_signal_test()) { + ksft_print_msg("Registering sigusr1 handler failed\n"); + exit(-1); + } + + pid = fork(); + + if (pid == -1) { + ksft_print_msg("Signal test: fork failed\n"); + goto out; + } + + if (pid == 0) { + while (!break_loop) + sleep(1); + + exit(11); + /* child shouldn't go beyond here */ + } + + /* send SIGUSR1 to child */ + kill(pid, SIGUSR1); + wait(&child_status); + +out: + + return (WIFEXITED(child_status) && + WEXITSTATUS(child_status) == 11); +} + +int execute_shadow_stack_tests(void) +{ + int ret = 0; + unsigned long test_count = 0; + unsigned long shstk_status = 0; + bool test_pass = false; + + ksft_print_msg("Executing RISC-V shadow stack self tests\n"); + ksft_set_plan(RISCV_SHADOW_STACK_TESTS); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &shstk_status, 0, 0, 0); + + if (ret != 0) + ksft_exit_fail_msg("Get shadow stack status failed with %d\n", ret); + + /* + * If we are here that means get shadow stack status succeeded and + * thus shadow stack support is baked in the kernel. + */ + while (test_count < RISCV_SHADOW_STACK_TESTS) { + test_pass = (*shstk_tests[test_count].t_func)(test_count, NULL); + ksft_test_result(test_pass, shstk_tests[test_count].name); + test_count++; + } + + ksft_finished(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.h b/tools/testing/selftests/riscv/cfi/shadowstack.h new file mode 100644 index 000000000000..943a3685905f --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_SHADOWSTACK_TEST_H +#define SELFTEST_SHADOWSTACK_TEST_H +#include +#include + +/* + * A CFI test returns true for success or false for fail. + * Takes a test number to index into array, and a void pointer. + */ +typedef bool (*shstk_test_func)(unsigned long test_num, void *); + +struct shadow_stack_tests { + char *name; + shstk_test_func t_func; +}; + +bool shadow_stack_fork_test(unsigned long test_num, void *ctx); +bool shadow_stack_map_test(unsigned long test_num, void *ctx); +bool shadow_stack_protection_test(unsigned long test_num, void *ctx); +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx); +bool shadow_stack_signal_test(unsigned long test_num, void *ctx); + +int execute_shadow_stack_tests(void); + +#endif -- cgit v1.2.3 From 6bc85baba4b08c787a8c9ba1bb0252a83e5c5603 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 9 Jan 2026 11:21:30 -0500 Subject: xdrgen: Implement pass-through lines in specifications XDR specification files can contain lines prefixed with '%' that pass through unchanged to generated output. Traditional rpcgen removes the '%' and emits the remainder verbatim, allowing direct insertion of C includes, pragma directives, or other language- specific content into the generated code. Until now, xdrgen silently discarded these lines during parsing. This prevented specifications from including necessary headers or preprocessor directives that might be required for the generated code to compile correctly. The grammar now captures pass-through lines instead of ignoring them. A new AST node type represents pass-through content, and the AST transformer strips the leading '%' character. Definition and source generators emit pass-through content in document order, preserving the original placement within the specification. This brings xdrgen closer to feature parity with traditional rpcgen while maintaining the existing document-order processing model. Existing generated xdrgen source code has been regenerated. Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr_gen.c | 11 +++++- fs/nfsd/nfs4xdr_gen.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 11 +++++- tools/net/sunrpc/xdrgen/README | 2 -- tools/net/sunrpc/xdrgen/generators/passthru.py | 26 +++++++++++++++ tools/net/sunrpc/xdrgen/grammars/xdr.lark | 6 ++-- tools/net/sunrpc/xdrgen/subcmds/declarations.py | 4 +-- tools/net/sunrpc/xdrgen/subcmds/definitions.py | 5 ++- tools/net/sunrpc/xdrgen/subcmds/source.py | 24 +++++++++---- .../xdrgen/templates/C/passthru/definition.j2 | 3 ++ .../sunrpc/xdrgen/templates/C/passthru/source.j2 | 3 ++ tools/net/sunrpc/xdrgen/xdr_ast.py | 39 ++++++++++++++++++++-- 12 files changed, 116 insertions(+), 20 deletions(-) create mode 100644 tools/net/sunrpc/xdrgen/generators/passthru.py create mode 100644 tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 (limited to 'tools') diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c index 1e5e2243625c..ce5c36e9070a 100644 --- a/fs/nfsd/nfs4xdr_gen.c +++ b/fs/nfsd/nfs4xdr_gen.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Generated by xdrgen. Manual edits will be lost. // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x -// XDR specification modification time: Thu Dec 25 13:44:43 2025 +// XDR specification modification time: Thu Jan 8 23:11:48 2026 #include @@ -178,6 +178,10 @@ xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_argument return xdrgen_decode_open_arguments4(xdr, ptr); } +/* + * Determine what OPEN supports. + */ + bool xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr) { @@ -190,6 +194,11 @@ xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg return xdrgen_decode_nfstime4(xdr, ptr); } +/* + * New RECOMMENDED Attribute for + * delegation caching of times + */ + static bool __maybe_unused xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr) { diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h index 47437876e803..8dfe10246506 100644 --- a/fs/nfsd/nfs4xdr_gen.h +++ b/fs/nfsd/nfs4xdr_gen.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */ +/* XDR specification modification time: Thu Jan 8 23:11:48 2026 */ #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H #define _LINUX_XDRGEN_NFS4_1_DECL_H diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 352bffda08f7..5283739242c7 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */ +/* XDR specification modification time: Thu Jan 8 23:11:48 2026 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -87,6 +87,10 @@ typedef enum open_args_createmode4 open_args_createmode4; typedef struct open_arguments4 fattr4_open_arguments; +/* + * Determine what OPEN supports. + */ + enum { FATTR4_OPEN_ARGUMENTS = 86 }; enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; @@ -95,6 +99,11 @@ typedef struct nfstime4 fattr4_time_deleg_access; typedef struct nfstime4 fattr4_time_deleg_modify; +/* + * New RECOMMENDED Attribute for + * delegation caching of times + */ + enum { FATTR4_TIME_DELEG_ACCESS = 84 }; enum { FATTR4_TIME_DELEG_MODIFY = 85 }; diff --git a/tools/net/sunrpc/xdrgen/README b/tools/net/sunrpc/xdrgen/README index 27218a78ab40..2cf05d1e4cd9 100644 --- a/tools/net/sunrpc/xdrgen/README +++ b/tools/net/sunrpc/xdrgen/README @@ -250,8 +250,6 @@ Add more pragma directives: Enable something like a #include to dynamically insert the content of other specification files -Properly support line-by-line pass-through via the "%" decorator - Build a unit test suite for verifying translation of XDR language into compilable code diff --git a/tools/net/sunrpc/xdrgen/generators/passthru.py b/tools/net/sunrpc/xdrgen/generators/passthru.py new file mode 100644 index 000000000000..cb17bd977f1e --- /dev/null +++ b/tools/net/sunrpc/xdrgen/generators/passthru.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# ex: set filetype=python: + +"""Generate code for XDR pass-through lines""" + +from generators import SourceGenerator, create_jinja2_environment +from xdr_ast import _XdrPassthru + + +class XdrPassthruGenerator(SourceGenerator): + """Generate source code for XDR pass-through content""" + + def __init__(self, language: str, peer: str): + """Initialize an instance of this class""" + self.environment = create_jinja2_environment(language, "passthru") + self.peer = peer + + def emit_definition(self, node: _XdrPassthru) -> None: + """Emit one pass-through line""" + template = self.environment.get_template("definition.j2") + print(template.render(content=node.content)) + + def emit_decoder(self, node: _XdrPassthru) -> None: + """Emit one pass-through line""" + template = self.environment.get_template("source.j2") + print(template.render(content=node.content)) diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark index b7c664f2acb7..1d2afff98ac5 100644 --- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark +++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark @@ -78,6 +78,9 @@ definition : constant_def | type_def | program_def | pragma_def + | passthru_def + +passthru_def : PASSTHRU // // RPC program definitions not specified in RFC 4506 @@ -115,8 +118,7 @@ decimal_constant : /[\+-]?(0|[1-9][0-9]*)/ hexadecimal_constant : /0x([a-f]|[A-F]|[0-9])+/ octal_constant : /0[0-7]+/ -PASSTHRU : "%" | "%" /.+/ -%ignore PASSTHRU +PASSTHRU : /%.*/ %import common.C_COMMENT %ignore C_COMMENT diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py index 97ffb76a02f1..ed83d48d1f68 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py +++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py @@ -10,7 +10,6 @@ from argparse import Namespace from lark import logger from lark.exceptions import VisitError -from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator from generators.header_bottom import XdrHeaderBottomGenerator from generators.header_top import XdrHeaderTopGenerator @@ -21,8 +20,7 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, _RpcProgram, Specification -from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer -from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion +from xdr_ast import _XdrEnum, _XdrPointer, _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate from xdr_parse import make_error_handler, XdrParseError from xdr_parse import handle_transform_error diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py index b17526a03dda..a48ca0549382 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py +++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py @@ -14,6 +14,7 @@ from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator from generators.header_bottom import XdrHeaderBottomGenerator from generators.header_top import XdrHeaderTopGenerator +from generators.passthru import XdrPassthruGenerator from generators.pointer import XdrPointerGenerator from generators.program import XdrProgramGenerator from generators.typedef import XdrTypedefGenerator @@ -21,7 +22,7 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, Specification -from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer +from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPassthru, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate from xdr_parse import make_error_handler, XdrParseError @@ -47,6 +48,8 @@ def emit_header_definitions(root: Specification, language: str, peer: str) -> No gen = XdrStructGenerator(language, peer) elif isinstance(definition.value, _XdrUnion): gen = XdrUnionGenerator(language, peer) + elif isinstance(definition.value, _XdrPassthru): + gen = XdrPassthruGenerator(language, peer) else: continue gen.emit_definition(definition.value) diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py index 6508563494fe..27e8767b1b58 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/source.py +++ b/tools/net/sunrpc/xdrgen/subcmds/source.py @@ -12,6 +12,7 @@ from lark.exceptions import VisitError from generators.source_top import XdrSourceTopGenerator from generators.enum import XdrEnumGenerator +from generators.passthru import XdrPassthruGenerator from generators.pointer import XdrPointerGenerator from generators.program import XdrProgramGenerator from generators.typedef import XdrTypedefGenerator @@ -19,7 +20,7 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, _RpcProgram, Specification -from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer +from xdr_ast import _XdrAst, _XdrEnum, _XdrPassthru, _XdrPointer from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation @@ -74,22 +75,31 @@ def generate_server_source(filename: str, root: Specification, language: str) -> gen.emit_source(filename, root) for definition in root.definitions: - emit_source_decoder(definition.value, language, "server") + if isinstance(definition.value, _XdrPassthru): + passthru_gen = XdrPassthruGenerator(language, "server") + passthru_gen.emit_decoder(definition.value) + else: + emit_source_decoder(definition.value, language, "server") for definition in root.definitions: - emit_source_encoder(definition.value, language, "server") + if not isinstance(definition.value, _XdrPassthru): + emit_source_encoder(definition.value, language, "server") def generate_client_source(filename: str, root: Specification, language: str) -> None: - """Generate server-side source code""" + """Generate client-side source code""" gen = XdrSourceTopGenerator(language, "client") gen.emit_source(filename, root) - print("") for definition in root.definitions: - emit_source_encoder(definition.value, language, "client") + if isinstance(definition.value, _XdrPassthru): + passthru_gen = XdrPassthruGenerator(language, "client") + passthru_gen.emit_decoder(definition.value) + else: + emit_source_encoder(definition.value, language, "client") for definition in root.definitions: - emit_source_decoder(definition.value, language, "client") + if not isinstance(definition.value, _XdrPassthru): + emit_source_decoder(definition.value, language, "client") # cel: todo: client needs PROC macros diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 new file mode 100644 index 000000000000..900c7516a29c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +{{ content }} diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 new file mode 100644 index 000000000000..900c7516a29c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +{{ content }} diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py index dc2fa9fd8ec2..14bff9477473 100644 --- a/tools/net/sunrpc/xdrgen/xdr_ast.py +++ b/tools/net/sunrpc/xdrgen/xdr_ast.py @@ -516,6 +516,13 @@ class _Pragma(_XdrAst): """Empty class for pragma directives""" +@dataclass +class _XdrPassthru(_XdrAst): + """Passthrough line to emit verbatim in output""" + + content: str + + @dataclass class Definition(_XdrAst, ast_utils.WithMeta): """Corresponds to 'definition' in the grammar""" @@ -738,14 +745,42 @@ class ParseToAst(Transformer): raise NotImplementedError("Directive not supported") return _Pragma() + def passthru_def(self, children): + """Instantiate one _XdrPassthru object""" + token = children[0] + content = token.value[1:] + return _XdrPassthru(content) + transformer = ast_utils.create_transformer(this_module, ParseToAst()) +def _merge_consecutive_passthru(definitions: List[Definition]) -> List[Definition]: + """Merge consecutive passthru definitions into single nodes""" + result = [] + i = 0 + while i < len(definitions): + if isinstance(definitions[i].value, _XdrPassthru): + lines = [definitions[i].value.content] + meta = definitions[i].meta + j = i + 1 + while j < len(definitions) and isinstance(definitions[j].value, _XdrPassthru): + lines.append(definitions[j].value.content) + j += 1 + merged = _XdrPassthru("\n".join(lines)) + result.append(Definition(meta, merged)) + i = j + else: + result.append(definitions[i]) + i += 1 + return result + + def transform_parse_tree(parse_tree): """Transform productions into an abstract syntax tree""" - - return transformer.transform(parse_tree) + ast = transformer.transform(parse_tree) + ast.definitions = _merge_consecutive_passthru(ast.definitions) + return ast def get_header_name() -> str: -- cgit v1.2.3 From c17b9046faf7d1f3b8bb992e4d53da873dc478fc Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 24 Jan 2026 23:50:12 +0900 Subject: selftests: pci_endpoint: Add BAR subrange mapping test case Add BAR_SUBRANGE_TEST to the pci_endpoint kselftest suite. The test uses the PCITEST_BAR_SUBRANGE ioctl and will skip when the chosen BAR is disabled (-ENODATA), when the endpoint/controller does not support subrange mapping (-EOPNOTSUPP), or when the BAR is reserved for the test register space (-EBUSY). Signed-off-by: Koichiro Den Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260124145012.2794108-9-den@valinux.co.jp --- .../testing/selftests/pci_endpoint/pci_endpoint_test.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c index 23aac6f97061..eecb776c33af 100644 --- a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c +++ b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c @@ -70,6 +70,23 @@ TEST_F(pci_ep_bar, BAR_TEST) EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); } +TEST_F(pci_ep_bar, BAR_SUBRANGE_TEST) +{ + int ret; + + pci_ep_ioctl(PCITEST_SET_IRQTYPE, PCITEST_IRQ_TYPE_AUTO); + ASSERT_EQ(0, ret) TH_LOG("Can't set AUTO IRQ type"); + + pci_ep_ioctl(PCITEST_BAR_SUBRANGE, variant->barno); + if (ret == -ENODATA) + SKIP(return, "BAR is disabled"); + if (ret == -EBUSY) + SKIP(return, "BAR is test register space"); + if (ret == -EOPNOTSUPP) + SKIP(return, "Subrange map is not supported"); + EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); +} + FIXTURE(pci_ep_basic) { int fd; -- cgit v1.2.3 From b3827c91cc9979fe04d99e016fb9c5f6260f29a0 Mon Sep 17 00:00:00 2001 From: Andre Carvalho Date: Tue, 27 Jan 2026 19:39:20 +0000 Subject: netconsole: selftests: Move netconsole selftests to separate target This patch moves netconsole selftests from drivers/net to its own target in drivers/net/netconsole. This change helps saving some resources from CI since tests in drivers/net automatically run against real hardware which are not used by netconsole tests as they rely solely on netdevsim. lib_netcons.sh is kept under drivers/net/lib since it is also used by bonding selftests. Finally, drivers/net config remains unchanged as netpoll_basic.py requires netconsole (and does leverage real HW testing). Reviewed-by: Breno Leitao Signed-off-by: Andre Carvalho Link: https://patch.msgid.link/20260127-netcons-selftest-target-v2-1-f509ab65b3bc@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/drivers/net/Makefile | 7 - .../testing/selftests/drivers/net/netcons_basic.sh | 74 ------ .../selftests/drivers/net/netcons_cmdline.sh | 65 ----- .../drivers/net/netcons_fragmented_msg.sh | 122 --------- .../selftests/drivers/net/netcons_overflow.sh | 67 ----- .../selftests/drivers/net/netcons_resume.sh | 124 ---------- .../selftests/drivers/net/netcons_sysdata.sh | 272 --------------------- .../selftests/drivers/net/netcons_torture.sh | 130 ---------- .../selftests/drivers/net/netconsole/Makefile | 19 ++ .../selftests/drivers/net/netconsole/config | 6 + .../drivers/net/netconsole/netcons_basic.sh | 74 ++++++ .../drivers/net/netconsole/netcons_cmdline.sh | 65 +++++ .../net/netconsole/netcons_fragmented_msg.sh | 122 +++++++++ .../drivers/net/netconsole/netcons_overflow.sh | 67 +++++ .../drivers/net/netconsole/netcons_resume.sh | 124 ++++++++++ .../drivers/net/netconsole/netcons_sysdata.sh | 272 +++++++++++++++++++++ .../drivers/net/netconsole/netcons_torture.sh | 130 ++++++++++ 19 files changed, 881 insertions(+), 862 deletions(-) delete mode 100755 tools/testing/selftests/drivers/net/netcons_basic.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_cmdline.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_overflow.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_sysdata.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_torture.sh create mode 100644 tools/testing/selftests/drivers/net/netconsole/Makefile create mode 100644 tools/testing/selftests/drivers/net/netconsole/config create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index ff9e204c5f33..0caa8aee5840 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18021,7 +18021,7 @@ S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c F: tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh -F: tools/testing/selftests/drivers/net/netcons\* +F: tools/testing/selftests/drivers/net/netconsole/ NETDEVSIM M: Jakub Kicinski diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 56e44a98d6a5..450f13ba4cca 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -22,6 +22,7 @@ TARGETS += drivers/ntsync TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net TARGETS += drivers/net/bonding +TARGETS += drivers/net/netconsole TARGETS += drivers/net/team TARGETS += drivers/net/virtio_net TARGETS += drivers/platform/x86/intel/ifs diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3eba569b3366..8154d6d429d3 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -15,13 +15,6 @@ TEST_PROGS := \ hds.py \ napi_id.py \ napi_threaded.py \ - netcons_basic.sh \ - netcons_cmdline.sh \ - netcons_fragmented_msg.sh \ - netcons_overflow.sh \ - netcons_resume.sh \ - netcons_sysdata.sh \ - netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh deleted file mode 100755 index 2022f3061738..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test creates two netdevsim virtual interfaces, assigns one of them (the -# "destination interface") to a new namespace, and assigns IP addresses to both -# interfaces. -# -# It listens on the destination interface using socat and configures a dynamic -# target on netconsole, pointing to the destination IP address. -# -# Finally, it checks whether the message was received properly on the -# destination interface. Note that this test may pollute the kernel log buffer -# (dmesg) and relies on dynamic configuration and namespaces being configured. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# The content of kmsg will be save to the following file -OUTPUT_FILE="/tmp/${TARGET}" - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT - -# Run the test twice, with different format modes -for FORMAT in "basic" "extended" -do - for IP_VERSION in "ipv6" "ipv4" - do - echo "Running with target mode: ${FORMAT} (${IP_VERSION})" - # Set current loglevel to KERN_INFO(6), and default to - # KERN_NOTICE(5) - echo "6 5" > /proc/sys/kernel/printk - # Create one namespace and two interfaces - set_network "${IP_VERSION}" - # Create a dynamic target for netconsole - create_dynamic_target "${FORMAT}" - # Only set userdata for extended format - if [ "$FORMAT" == "extended" ] - then - # Set userdata "key" with the "value" value - set_user_data - fi - # Listed for netconsole port inside the namespace and - # destination interface - listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & - # Wait for socat to start and listen to the port. - wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}" - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - - # Make sure the message was received in the dst part - # and exit - validate_result "${OUTPUT_FILE}" "${FORMAT}" - # kill socat in case it is still running - pkill_socat - cleanup - echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2 - done -done - -trap - EXIT -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netcons_cmdline.sh deleted file mode 100755 index d1d23dc67f99..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_cmdline.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This is a selftest to test cmdline arguments on netconsole. -# It exercises loading of netconsole from cmdline instead of the dynamic -# reconfiguration. This includes parsing the long netconsole= line and all the -# flow through init_netconsole(). -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -check_netconsole_module - -modprobe netdevsim 2> /dev/null || true -rmmod netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -# check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace and network interfaces -trap do_cleanup EXIT -# Create one namespace and two interfaces -set_network - -# Run the test twice, with different cmdline parameters -for BINDMODE in "ifname" "mac" -do - echo "Running with bind mode: ${BINDMODE}" >&2 - # Create the command line for netconsole, with the configuration from - # the function above - CMDLINE=$(create_cmdline_str "${BINDMODE}") - - # The content of kmsg will be save to the following file - OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" - - # Load the module, with the cmdline set - modprobe netconsole "${CMDLINE}" - - # Listed for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - # Make sure the message was received in the dst part - # and exit - validate_msg "${OUTPUT_FILE}" - - # kill socat in case it is still running - pkill_socat - # Unload the module - rmmod netconsole - echo "${BINDMODE} : Test passed" >&2 -done - -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh deleted file mode 100755 index 4a71e01a230c..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# Test netconsole's message fragmentation functionality. -# -# When a message exceeds the maximum packet size, netconsole splits it into -# multiple fragments for transmission. This test verifies: -# - Correct fragmentation of large messages -# - Proper reassembly of fragments at the receiver -# - Preservation of userdata across fragments -# - Behavior with and without kernel release version appending -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# The content of kmsg will be save to the following file -OUTPUT_FILE="/tmp/${TARGET}" - -# set userdata to a long value. In this case, it is "1-2-3-4...50-" -USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) - -# Convert the header string in a regexp, so, we can remove -# the second header as well. -# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If -# release is appended, you might find something like:L -# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" -function header_to_regex() { - # header is everything before ; - local HEADER="${1}" - REGEX=$(echo "${HEADER}" | cut -d'=' -f1) - echo "${REGEX}=[0-9]*\/[0-9]*;" -} - -# We have two headers in the message. Remove both to get the full message, -# and extract the full message. -function extract_msg() { - local MSGFILE="${1}" - # Extract the header, which is the very first thing that arrives in the - # first list. - HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) - HEADER_REGEX=$(header_to_regex "${HEADER}") - - # Remove the two headers from the received message - # This will return the message without any header, similarly to what - # was sent. - sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" -} - -# Validate the message, which has two messages glued together. -# unwrap them to make sure all the characters were transmitted. -# File will look like the following: -# 13,468,514729715,-,ncfrag=0/1135; -# key=-13,468,514729715,-,ncfrag=967/1135; -function validate_fragmented_result() { - # Discard the netconsole headers, and assemble the full message - RCVMSG=$(extract_msg "${1}") - - # check for the main message - if ! echo "${RCVMSG}" | grep -q "${MSG}"; then - echo "Message body doesn't match." >&2 - echo "msg received=" "${RCVMSG}" >&2 - exit "${ksft_fail}" - fi - - # check userdata - if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then - echo "message userdata doesn't match" >&2 - echo "msg received=" "${RCVMSG}" >&2 - exit "${ksft_fail}" - fi - # test passed. hooray -} - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target -# Set userdata "key" with the "value" value -set_user_data - - -# TEST 1: Send message and userdata. They will fragment -# ======= -MSG=$(printf -- 'MSG%.3s=' {1..150}) - -# Listen for netconsole port inside the namespace and destination interface -listen_port_and_save_to "${OUTPUT_FILE}" & -# Wait for socat to start and listen to the port. -wait_local_port_listen "${NAMESPACE}" "${PORT}" udp -# Send the message -echo "${MSG}: ${TARGET}" > /dev/kmsg -# Wait until socat saves the file to disk -busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -# Check if the message was not corrupted -validate_fragmented_result "${OUTPUT_FILE}" - -# TEST 2: Test with smaller message, and without release appended -# ======= -MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) -# Let's disable release and test again. -disable_release_append - -listen_port_and_save_to "${OUTPUT_FILE}" & -wait_local_port_listen "${NAMESPACE}" "${PORT}" udp -echo "${MSG}: ${TARGET}" > /dev/kmsg -busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -validate_fragmented_result "${OUTPUT_FILE}" -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh deleted file mode 100755 index 06089643b771..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_overflow.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test verifies that users can successfully create up to -# MAX_USERDATA_ITEMS userdata entries without encountering any failures. -# -# Additionally, it tests for expected failure when attempting to exceed this -# maximum limit. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh -# This is coming from netconsole code. Check for it in drivers/net/netconsole.c -MAX_USERDATA_ITEMS=256 - -# Function to create userdata entries -function create_userdata_max_entries() { - # All these keys should be created without any error - for i in $(seq $MAX_USERDATA_ITEMS) - do - # USERDATA_KEY is used by set_user_data - USERDATA_KEY="key"${i} - set_user_data - done -} - -# Function to verify the entry limit -function verify_entry_limit() { - # Allowing the test to fail without exiting, since the next command - # will fail - set +e - mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null - ret="$?" - set -e - if [ "$ret" -eq 0 ]; - then - echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2 - ls "${NETCONS_PATH}/userdata/" >&2 - exit "${ksft_fail}" - fi -} - -# ========== # -# Start here # -# ========== # - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies - -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target -# populate the maximum number of supported keys in userdata -create_userdata_max_entries -# Verify an additional entry is not allowed -verify_entry_limit -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh deleted file mode 100755 index fc5e5e3ad3d4..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_resume.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test validates that netconsole is able to resume a target that was -# deactivated when its interface was removed when the interface is brought -# back up. -# -# The test configures a netconsole target and then removes netdevsim module to -# cause the interface to disappear. Targets are configured via cmdline to ensure -# targets bound by interface name and mac address can be resumed. -# The test verifies that the target moved to disabled state before adding -# netdevsim and the interface back. -# -# Finally, the test verifies that the target is re-enabled automatically and -# the message is received on the destination interface. -# -# Author: Andre Carvalho - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -SAVED_SRCMAC="" # to be populated later -SAVED_DSTMAC="" # to be populated later - -modprobe netdevsim 2> /dev/null || true -rmmod netconsole 2> /dev/null || true - -check_netconsole_module - -function cleanup() { - cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" - do_cleanup - rmmod netconsole -} - -function trigger_reactivation() { - # Add back low level module - modprobe netdevsim - # Recreate namespace and two interfaces - set_network - # Restore MACs - ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ - address "${SAVED_DSTMAC}" - if [ "${BINDMODE}" == "mac" ]; then - ip link set dev "${SRCIF}" down - ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" - # Rename device in order to trigger target resume, as initial - # when device was recreated it didn't have correct mac address. - ip link set dev "${SRCIF}" name "${TARGET}" - fi -} - -function trigger_deactivation() { - # Start by storing mac addresses so we can be restored in reactivate - SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ - cat /sys/class/net/"$DSTIF"/address) - SAVED_SRCMAC=$(mac_get "${SRCIF}") - # Remove low level module - rmmod netdevsim -} - -trap cleanup EXIT - -# Run the test twice, with different cmdline parameters -for BINDMODE in "ifname" "mac" -do - echo "Running with bind mode: ${BINDMODE}" >&2 - # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) - echo "6 5" > /proc/sys/kernel/printk - - # Create one namespace and two interfaces - set_network - - # Create the command line for netconsole, with the configuration from - # the function above - CMDLINE=$(create_cmdline_str "${BINDMODE}") - - # The content of kmsg will be save to the following file - OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" - - # Load the module, with the cmdline set - modprobe netconsole "${CMDLINE}" - # Expose cmdline target in configfs - mkdir "${NETCONS_CONFIGFS}/cmdline0" - - # Target should be enabled - wait_target_state "cmdline0" "enabled" - - # Trigger deactivation by unloading netdevsim module. Target should be - # disabled. - trigger_deactivation - wait_target_state "cmdline0" "disabled" - - # Trigger reactivation by loading netdevsim, recreating the network and - # restoring mac addresses. Target should be re-enabled. - trigger_reactivation - wait_target_state "cmdline0" "enabled" - - # Listen for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - # Make sure the message was received in the dst part - # and exit - validate_msg "${OUTPUT_FILE}" - - # kill socat in case it is still running - pkill_socat - # Cleanup & unload the module - cleanup - - echo "${BINDMODE} : Test passed" >&2 -done - -trap - EXIT -exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netcons_sysdata.sh deleted file mode 100755 index baf69031089e..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_sysdata.sh +++ /dev/null @@ -1,272 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# A test that makes sure that sysdata runtime CPU data is properly set -# when a message is sent. -# -# There are 3 different tests, every time sent using a random CPU. -# - Test #1 -# * Only enable cpu_nr sysdata feature. -# - Test #2 -# * Keep cpu_nr sysdata feature enable and enable userdata. -# - Test #3 -# * keep userdata enabled, and disable sysdata cpu_nr feature. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -# Enable the sysdata cpu_nr feature -function set_cpu_nr() { - if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]] - then - echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" -} - -# Enable the taskname to be appended to sysdata -function set_taskname() { - if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]] - then - echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled" -} - -# Enable the release to be appended to sysdata -function set_release() { - if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]] - then - echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/release_enabled" -} - -# Enable the msgid to be appended to sysdata -function set_msgid() { - if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]] - then - echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled" -} - -# Disable the sysdata cpu_nr feature -function unset_cpu_nr() { - echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" -} - -# Once called, taskname=<..> will not be appended anymore -function unset_taskname() { - echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled" -} - -function unset_release() { - echo 0 > "${NETCONS_PATH}/userdata/release_enabled" -} - -function unset_msgid() { - echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled" -} - -# Test if MSG contains sysdata -function validate_sysdata() { - # OUTPUT_FILE will contain something like: - # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM - # userdatakey=userdatavalue - # cpu=X - # taskname= - # msgid= - - # Echo is what this test uses to create the message. See runtest() - # function - SENDER="echo" - - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then - echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - # Check if cpu=XX exists in the file and matches the one used - # in taskset(1) - if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then - echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then - echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then - echo "FAIL: 'msgid=' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - rm "${OUTPUT_FILE}" - pkill_socat -} - -function validate_release() { - RELEASE=$(uname -r) - - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then - echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi -} - -# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=` -# strings -function validate_no_sysdata() { - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then - echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "cpu=" "${OUTPUT_FILE}"; then - echo "FAIL: 'cpu= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "taskname=" "${OUTPUT_FILE}"; then - echo "FAIL: 'taskname= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "release=" "${OUTPUT_FILE}"; then - echo "FAIL: 'release= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "msgid=" "${OUTPUT_FILE}"; then - echo "FAIL: 'msgid= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - rm "${OUTPUT_FILE}" -} - -# Start socat, send the message and wait for the file to show up in the file -# system -function runtest { - # Listen for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -} - -# ========== # -# Start here # -# ========== # - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies -# This test also depends on taskset(1). Check for it before starting the test -check_for_taskset - -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target - -#==================================================== -# TEST #1 -# Send message from a random CPU -#==================================================== -# Random CPU in the system -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_1" -MSG="Test #1 from CPU${CPU}" -# Enable the auto population of cpu_nr -set_cpu_nr -# Enable taskname to be appended to sysdata -set_taskname -set_release -set_msgid -runtest -# Make sure the message was received in the dst part -# and exit -validate_release -validate_sysdata - -#==================================================== -# TEST #2 -# This test now adds userdata together with sysdata -# =================================================== -# Get a new random CPU -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_2" -MSG="Test #2 from CPU${CPU}" -set_user_data -runtest -validate_release -validate_sysdata - -# =================================================== -# TEST #3 -# Unset all sysdata, fail if any userdata is set -# =================================================== -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_3" -MSG="Test #3 from CPU${CPU}" -unset_cpu_nr -unset_taskname -unset_release -unset_msgid -runtest -# At this time, cpu= shouldn't be present in the msg -validate_no_sysdata - -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh deleted file mode 100755 index 2ce9ee3719d1..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_torture.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# Repeatedly send kernel messages, toggles netconsole targets on and off, -# creates and deletes targets in parallel, and toggles the source interface to -# simulate stress conditions. -# -# This test aims to verify the robustness of netconsole under dynamic -# configurations and concurrent operations. -# -# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make -# sure no issues is reported. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -# Number of times the main loop run -ITERATIONS=${1:-150} - -# Only test extended format -FORMAT="extended" -# And ipv6 only -IP_VERSION="ipv6" - -# Create, enable and delete some targets. -create_and_delete_random_target() { - COUNT=2 - RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) - - if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ - [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then - echo "Function didn't finish yet, skipping it." >&2 - return - fi - - # enable COUNT targets - for i in $(seq ${COUNT}) - do - RND_TARGET="${RND_PREFIX}"${i} - RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" - - # Basic population so the target can come up - _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" - done - - echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg - # disable them all - for i in $(seq ${COUNT}) - do - RND_TARGET="${RND_PREFIX}"${i} - RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" - if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] - then - echo 0 > "${RND_TARGET_PATH}"/enabled - fi - rmdir "${RND_TARGET_PATH}" - done -} - -# Disable and enable the target mid-air, while messages -# are being transmitted. -toggle_netcons_target() { - for i in $(seq 2) - do - if [ ! -d "${NETCONS_PATH}" ] - then - break - fi - echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true - # Try to enable a bit harder, given it might fail to enable - # Write to `enabled` might fail depending on the lock, which is - # highly contentious here - for _ in $(seq 5) - do - echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true - done - done -} - -toggle_iface(){ - ip link set "${SRCIF}" down - ip link set "${SRCIF}" up -} - -# Start here - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network "${IP_VERSION}" -# Create a dynamic target for netconsole -create_dynamic_target "${FORMAT}" - -for i in $(seq "$ITERATIONS") -do - for _ in $(seq 10) - do - echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg - done - wait - - if (( i % 30 == 0 )); then - toggle_netcons_target & - fi - - if (( i % 50 == 0 )); then - # create some targets, enable them, send msg and disable - # all in a parallel thread - create_and_delete_random_target & - fi - - if (( i % 70 == 0 )); then - toggle_iface & - fi -done -wait - -exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netconsole/Makefile b/tools/testing/selftests/drivers/net/netconsole/Makefile new file mode 100644 index 000000000000..b56c70b7e274 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := \ + ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ +# end of TEST_INCLUDES + +TEST_PROGS := \ + netcons_basic.sh \ + netcons_cmdline.sh \ + netcons_fragmented_msg.sh \ + netcons_overflow.sh \ + netcons_resume.sh \ + netcons_sysdata.sh \ + netcons_torture.sh \ +# end of TEST_PROGS + +include ../../../lib.mk + diff --git a/tools/testing/selftests/drivers/net/netconsole/config b/tools/testing/selftests/drivers/net/netconsole/config new file mode 100644 index 000000000000..a3f6b0fd44ef --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/config @@ -0,0 +1,6 @@ +CONFIG_CONFIGFS_FS=y +CONFIG_IPV6=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y +CONFIG_NETDEVSIM=m diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh new file mode 100755 index 000000000000..59cf10013ecd --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test creates two netdevsim virtual interfaces, assigns one of them (the +# "destination interface") to a new namespace, and assigns IP addresses to both +# interfaces. +# +# It listens on the destination interface using socat and configures a dynamic +# target on netconsole, pointing to the destination IP address. +# +# Finally, it checks whether the message was received properly on the +# destination interface. Note that this test may pollute the kernel log buffer +# (dmesg) and relies on dynamic configuration and namespaces being configured. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT + +# Run the test twice, with different format modes +for FORMAT in "basic" "extended" +do + for IP_VERSION in "ipv6" "ipv4" + do + echo "Running with target mode: ${FORMAT} (${IP_VERSION})" + # Set current loglevel to KERN_INFO(6), and default to + # KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + # Create one namespace and two interfaces + set_network "${IP_VERSION}" + # Create a dynamic target for netconsole + create_dynamic_target "${FORMAT}" + # Only set userdata for extended format + if [ "$FORMAT" == "extended" ] + then + # Set userdata "key" with the "value" value + set_user_data + fi + # Listed for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat + cleanup + echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2 + done +done + +trap - EXIT +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh new file mode 100755 index 000000000000..96d704b8d9d9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This is a selftest to test cmdline arguments on netconsole. +# It exercises loading of netconsole from cmdline instead of the dynamic +# reconfiguration. This includes parsing the long netconsole= line and all the +# flow through init_netconsole(). +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +check_netconsole_module + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +# check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace and network interfaces +trap do_cleanup EXIT +# Create one namespace and two interfaces +set_network + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + + # Listed for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Unload the module + rmmod netconsole + echo "${BINDMODE} : Test passed" >&2 +done + +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh new file mode 100755 index 000000000000..0dc7280c3080 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Test netconsole's message fragmentation functionality. +# +# When a message exceeds the maximum packet size, netconsole splits it into +# multiple fragments for transmission. This test verifies: +# - Correct fragmentation of large messages +# - Proper reassembly of fragments at the receiver +# - Preservation of userdata across fragments +# - Behavior with and without kernel release version appending +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# set userdata to a long value. In this case, it is "1-2-3-4...50-" +USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) + +# Convert the header string in a regexp, so, we can remove +# the second header as well. +# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If +# release is appended, you might find something like:L +# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" +function header_to_regex() { + # header is everything before ; + local HEADER="${1}" + REGEX=$(echo "${HEADER}" | cut -d'=' -f1) + echo "${REGEX}=[0-9]*\/[0-9]*;" +} + +# We have two headers in the message. Remove both to get the full message, +# and extract the full message. +function extract_msg() { + local MSGFILE="${1}" + # Extract the header, which is the very first thing that arrives in the + # first list. + HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) + HEADER_REGEX=$(header_to_regex "${HEADER}") + + # Remove the two headers from the received message + # This will return the message without any header, similarly to what + # was sent. + sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" +} + +# Validate the message, which has two messages glued together. +# unwrap them to make sure all the characters were transmitted. +# File will look like the following: +# 13,468,514729715,-,ncfrag=0/1135; +# key=-13,468,514729715,-,ncfrag=967/1135; +function validate_fragmented_result() { + # Discard the netconsole headers, and assemble the full message + RCVMSG=$(extract_msg "${1}") + + # check for the main message + if ! echo "${RCVMSG}" | grep -q "${MSG}"; then + echo "Message body doesn't match." >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + + # check userdata + if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then + echo "message userdata doesn't match" >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + # test passed. hooray +} + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Set userdata "key" with the "value" value +set_user_data + + +# TEST 1: Send message and userdata. They will fragment +# ======= +MSG=$(printf -- 'MSG%.3s=' {1..150}) + +# Listen for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +# Check if the message was not corrupted +validate_fragmented_result "${OUTPUT_FILE}" + +# TEST 2: Test with smaller message, and without release appended +# ======= +MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) +# Let's disable release and test again. +disable_release_append + +listen_port_and_save_to "${OUTPUT_FILE}" & +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +echo "${MSG}: ${TARGET}" > /dev/kmsg +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +validate_fragmented_result "${OUTPUT_FILE}" +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh new file mode 100755 index 000000000000..a8e43d08c166 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test verifies that users can successfully create up to +# MAX_USERDATA_ITEMS userdata entries without encountering any failures. +# +# Additionally, it tests for expected failure when attempting to exceed this +# maximum limit. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh +# This is coming from netconsole code. Check for it in drivers/net/netconsole.c +MAX_USERDATA_ITEMS=256 + +# Function to create userdata entries +function create_userdata_max_entries() { + # All these keys should be created without any error + for i in $(seq $MAX_USERDATA_ITEMS) + do + # USERDATA_KEY is used by set_user_data + USERDATA_KEY="key"${i} + set_user_data + done +} + +# Function to verify the entry limit +function verify_entry_limit() { + # Allowing the test to fail without exiting, since the next command + # will fail + set +e + mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null + ret="$?" + set -e + if [ "$ret" -eq 0 ]; + then + echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2 + ls "${NETCONS_PATH}/userdata/" >&2 + exit "${ksft_fail}" + fi +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies + +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# populate the maximum number of supported keys in userdata +create_userdata_max_entries +# Verify an additional entry is not allowed +verify_entry_limit +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh new file mode 100755 index 000000000000..cb59cf436dd0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test validates that netconsole is able to resume a target that was +# deactivated when its interface was removed when the interface is brought +# back up. +# +# The test configures a netconsole target and then removes netdevsim module to +# cause the interface to disappear. Targets are configured via cmdline to ensure +# targets bound by interface name and mac address can be resumed. +# The test verifies that the target moved to disabled state before adding +# netdevsim and the interface back. +# +# Finally, the test verifies that the target is re-enabled automatically and +# the message is received on the destination interface. +# +# Author: Andre Carvalho + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +SAVED_SRCMAC="" # to be populated later +SAVED_DSTMAC="" # to be populated later + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +check_netconsole_module + +function cleanup() { + cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" + do_cleanup + rmmod netconsole +} + +function trigger_reactivation() { + # Add back low level module + modprobe netdevsim + # Recreate namespace and two interfaces + set_network + # Restore MACs + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ + address "${SAVED_DSTMAC}" + if [ "${BINDMODE}" == "mac" ]; then + ip link set dev "${SRCIF}" down + ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" + # Rename device in order to trigger target resume, as initial + # when device was recreated it didn't have correct mac address. + ip link set dev "${SRCIF}" name "${TARGET}" + fi +} + +function trigger_deactivation() { + # Start by storing mac addresses so we can be restored in reactivate + SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ + cat /sys/class/net/"$DSTIF"/address) + SAVED_SRCMAC=$(mac_get "${SRCIF}") + # Remove low level module + rmmod netdevsim +} + +trap cleanup EXIT + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + + # Create one namespace and two interfaces + set_network + + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + # Expose cmdline target in configfs + mkdir "${NETCONS_CONFIGFS}/cmdline0" + + # Target should be enabled + wait_target_state "cmdline0" "enabled" + + # Trigger deactivation by unloading netdevsim module. Target should be + # disabled. + trigger_deactivation + wait_target_state "cmdline0" "disabled" + + # Trigger reactivation by loading netdevsim, recreating the network and + # restoring mac addresses. Target should be re-enabled. + trigger_reactivation + wait_target_state "cmdline0" "enabled" + + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Cleanup & unload the module + cleanup + + echo "${BINDMODE} : Test passed" >&2 +done + +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh new file mode 100755 index 000000000000..3fb8c4afe3d2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh @@ -0,0 +1,272 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# A test that makes sure that sysdata runtime CPU data is properly set +# when a message is sent. +# +# There are 3 different tests, every time sent using a random CPU. +# - Test #1 +# * Only enable cpu_nr sysdata feature. +# - Test #2 +# * Keep cpu_nr sysdata feature enable and enable userdata. +# - Test #3 +# * keep userdata enabled, and disable sysdata cpu_nr feature. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +# Enable the sysdata cpu_nr feature +function set_cpu_nr() { + if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]] + then + echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Enable the taskname to be appended to sysdata +function set_taskname() { + if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]] + then + echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +# Enable the release to be appended to sysdata +function set_release() { + if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]] + then + echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/release_enabled" +} + +# Enable the msgid to be appended to sysdata +function set_msgid() { + if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]] + then + echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled" +} + +# Disable the sysdata cpu_nr feature +function unset_cpu_nr() { + echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Once called, taskname=<..> will not be appended anymore +function unset_taskname() { + echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +function unset_release() { + echo 0 > "${NETCONS_PATH}/userdata/release_enabled" +} + +function unset_msgid() { + echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled" +} + +# Test if MSG contains sysdata +function validate_sysdata() { + # OUTPUT_FILE will contain something like: + # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM + # userdatakey=userdatavalue + # cpu=X + # taskname= + # msgid= + + # Echo is what this test uses to create the message. See runtest() + # function + SENDER="echo" + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + # Check if cpu=XX exists in the file and matches the one used + # in taskset(1) + if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then + echo "FAIL: 'msgid=' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" + pkill_socat +} + +function validate_release() { + RELEASE=$(uname -r) + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then + echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi +} + +# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=` +# strings +function validate_no_sysdata() { + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "cpu=" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "taskname=" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "release=" "${OUTPUT_FILE}"; then + echo "FAIL: 'release= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "msgid=" "${OUTPUT_FILE}"; then + echo "FAIL: 'msgid= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" +} + +# Start socat, send the message and wait for the file to show up in the file +# system +function runtest { + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# This test also depends on taskset(1). Check for it before starting the test +check_for_taskset + +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target + +#==================================================== +# TEST #1 +# Send message from a random CPU +#==================================================== +# Random CPU in the system +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_1" +MSG="Test #1 from CPU${CPU}" +# Enable the auto population of cpu_nr +set_cpu_nr +# Enable taskname to be appended to sysdata +set_taskname +set_release +set_msgid +runtest +# Make sure the message was received in the dst part +# and exit +validate_release +validate_sysdata + +#==================================================== +# TEST #2 +# This test now adds userdata together with sysdata +# =================================================== +# Get a new random CPU +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_2" +MSG="Test #2 from CPU${CPU}" +set_user_data +runtest +validate_release +validate_sysdata + +# =================================================== +# TEST #3 +# Unset all sysdata, fail if any userdata is set +# =================================================== +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_3" +MSG="Test #3 from CPU${CPU}" +unset_cpu_nr +unset_taskname +unset_release +unset_msgid +runtest +# At this time, cpu= shouldn't be present in the msg +validate_no_sysdata + +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh new file mode 100755 index 000000000000..33a44adb6f8f --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! -d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" -- cgit v1.2.3 From 7b85c77585409f76609c817b760db60f3bf8fd33 Mon Sep 17 00:00:00 2001 From: Nimrod Oren Date: Wed, 28 Jan 2026 11:02:17 +0200 Subject: selftests: drv-net: rss_flow_label: skip unsupported devices The test_rss_flow_label_6only test case fails on devices that do not support IPv6 flow label hashing. Make it skip neatly, consistent with the behavior of the test_rss_flow_label case. Reviewed-by: Gal Pressman Signed-off-by: Nimrod Oren Link: https://patch.msgid.link/20260128090217.663366-1-noren@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/rss_flow_label.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py index 6fa95fe27c47..7dc80070884a 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -145,9 +145,14 @@ def test_rss_flow_label_6only(cfg): # Try to enable Flow Labels and check again, in case it leaks thru initial = _ethtool_get_cfg(cfg, "udp6") - changed = initial.replace("l", "") if "l" in initial else initial + "l" - - cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + else: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") _check_v4_flow_types(cfg) -- cgit v1.2.3 From 15ac1adf0f84a90605121fbe4a6238b24c865f92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:08 +0100 Subject: selftests/bpf: Add test for sleepable program tailcalls Adding test that makes sure we can't mix sleepable and non-sleepable bpf programs in the BPF_MAP_TYPE_PROG_ARRAY map and that we can do tail call in the sleepable program. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260130081208.1130204-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 74 ++++++++++++++++++++++ .../selftests/bpf/progs/tailcall_sleepable.c | 43 +++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_sleepable.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 0ab36503c3b2..7d534fde0af9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,7 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1653,6 +1654,77 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +noinline void uprobe_sleepable_trigger(void) +{ + asm volatile (""); +} + +static void test_tailcall_sleepable(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct tailcall_sleepable *skel; + int prog_fd, map_fd; + int err, key; + + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + /* + * Test that we can't load uprobe_normal and uprobe_sleepable_1, + * because they share tailcall map. + */ + bpf_program__set_autoload(skel->progs.uprobe_normal, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_ERR(err, "tailcall_sleepable__load")) + goto out; + + tailcall_sleepable__destroy(skel); + + /* + * Test that we can tail call from sleepable to sleepable program. + */ + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_2, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_OK(err, "tailcall_sleepable__load")) + goto out; + + /* Add sleepable uprobe_sleepable_2 to jmp_table[0]. */ + key = 0; + prog_fd = bpf_program__fd(skel->progs.uprobe_sleepable_2); + map_fd = bpf_map__fd(skel->maps.jmp_table); + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + skel->bss->my_pid = getpid(); + + /* Attach uprobe_sleepable_1 to uprobe_sleepable_trigger and hit it. */ + opts.func_name = "uprobe_sleepable_trigger"; + skel->links.uprobe_sleepable_1 = bpf_program__attach_uprobe_opts( + skel->progs.uprobe_sleepable_1, + -1, + "/proc/self/exe", + 0 /* offset */, + &opts); + if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable_1, "bpf_program__attach_uprobe_opts")) + goto out; + + uprobe_sleepable_trigger(); + ASSERT_EQ(skel->bss->executed, 1, "executed"); + +out: + tailcall_sleepable__destroy(skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1707,4 +1779,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_freplace(); if (test__start_subtest("tailcall_failure")) test_tailcall_failure(); + if (test__start_subtest("tailcall_sleepable")) + test_tailcall_sleepable(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_sleepable.c b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c new file mode 100644 index 000000000000..d959a9eaaa9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" +#include "bpf_test_utils.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps"); + +SEC("?uprobe") +int uprobe_normal(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("?uprobe.s") +int uprobe_sleepable_1(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +int executed = 0; +int my_pid = 0; + +SEC("?uprobe.s") +int uprobe_sleepable_2(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + executed++; + return 0; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From cd77618c418254b827f2a807b4c27b97088fdb52 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 30 Jan 2026 11:18:43 +0900 Subject: selftests/bpf: Make bpf get_preempt_count() work for v6.14+ kernels Recent x86 kernels export __preempt_count as a ksym, while some old kernels between v6.1 and v6.14 expose the preemption counter via pcpu_hot.preempt_count. The existing selftest helper unconditionally dereferenced __preempt_count, which breaks BPF program loading on such old kernels. Make the x86 preemption count lookup version-agnostic by: - Marking __preempt_count and pcpu_hot as weak ksyms. - Introducing a BTF-described pcpu_hot___local layout with preserve_access_index. - Selecting the appropriate access path at runtime using ksym availability and bpf_ksym_exists() and bpf_core_field_exists(). This allows a single BPF binary to run correctly across kernel versions (e.g., v6.18 vs. v6.13) without relying on compile-time version checks. Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260130021843.154885-1-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a39576c8ba04..4b7210c318dd 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -614,7 +614,13 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 -extern const int __preempt_count __ksym; +extern const int __preempt_count __ksym __weak; + +struct pcpu_hot___local { + int preempt_count; +} __attribute__((preserve_access_index)); + +extern struct pcpu_hot___local pcpu_hot __ksym __weak; #endif struct task_struct___preempt_rt { @@ -624,7 +630,19 @@ struct task_struct___preempt_rt { static inline int get_preempt_count(void) { #if defined(bpf_target_x86) - return *(int *) bpf_this_cpu_ptr(&__preempt_count); + /* By default, read the per-CPU __preempt_count. */ + if (bpf_ksym_exists(&__preempt_count)) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); + + /* + * If __preempt_count does not exist, try to read preempt_count under + * struct pcpu_hot. Between v6.1 and v6.14 -- more specifically, + * [64701838bf057, 46e8fff6d45fe), preempt_count had been managed + * under struct pcpu_hot. + */ + if (bpf_core_field_exists(pcpu_hot.preempt_count)) + return ((struct pcpu_hot___local *) + bpf_this_cpu_ptr(&pcpu_hot))->preempt_count; #elif defined(bpf_target_arm64) return bpf_get_current_task_btf()->thread_info.preempt.count; #endif -- cgit v1.2.3 From 0207f94971e72a13380e28022c86da147e8e090f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:34 +0100 Subject: selftests/bpf: Fix kprobe multi stacktrace_ips test We now include the attached function in the stack trace, fixing the test accordingly. Fixes: c9e208fa93cd ("selftests/bpf: Add stacktrace ips test for kprobe_multi/kretprobe_multi") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-4-jolsa@kernel.org --- .../testing/selftests/bpf/prog_tests/stacktrace_ips.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c9efdd2a5b18..c93718dafd9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -74,11 +74,20 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe) load_kallsyms(); - check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, - ksym_get_addr("bpf_testmod_stacktrace_test_3"), - ksym_get_addr("bpf_testmod_stacktrace_test_2"), - ksym_get_addr("bpf_testmod_stacktrace_test_1"), - ksym_get_addr("bpf_testmod_test_read")); + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } cleanup: stacktrace_ips__destroy(skel); -- cgit v1.2.3 From 7373f97e868ad01fc73de8e7b71834eeba25d4f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:35 +0100 Subject: selftests/bpf: Add stacktrace ips test for kprobe/kretprobe Adding test that attaches kprobe/kretprobe and verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 50 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/stacktrace_ips.c | 7 +++ 2 files changed, 57 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c93718dafd9b..852830536109 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -137,6 +137,52 @@ cleanup: stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_kprobe(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_test = bpf_program__attach_kprobe_opts( + skel->progs.kprobe_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_test, "bpf_program__attach_kprobe_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -145,6 +191,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe_multi(true); if (test__start_subtest("raw_tp")) test_stacktrace_ips_raw_tp(); + if (test__start_subtest("kprobe")) + test_stacktrace_ips_kprobe(false); + if (test__start_subtest("kretprobe")) + test_stacktrace_ips_kprobe(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index a96c8150d7f5..cae077a4061b 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -31,6 +31,13 @@ int unused(void) __u32 stack_key; +SEC("kprobe") +int kprobe_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + SEC("kprobe.multi") int kprobe_multi_test(struct pt_regs *ctx) { -- cgit v1.2.3 From e5d532be4a3b0d1f0a9210c0da2c04a6b4605904 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:36 +0100 Subject: selftests/bpf: Add stacktrace ips test for fentry/fexit Adding test that attaches fentry/fexitand verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-6-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 51 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/stacktrace_ips.c | 20 +++++++++ 2 files changed, 71 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index 852830536109..da42b00e3d1f 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -183,6 +183,53 @@ cleanup: stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_trampoline(bool retprobe) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + if (retprobe) { + skel->links.fexit_test = bpf_program__attach_trace(skel->progs.fexit_test); + if (!ASSERT_OK_PTR(skel->links.fexit_test, "bpf_program__attach_trace")) + goto cleanup; + } else { + skel->links.fentry_test = bpf_program__attach_trace(skel->progs.fentry_test); + if (!ASSERT_OK_PTR(skel->links.fentry_test, "bpf_program__attach_trace")) + goto cleanup; + } + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -195,6 +242,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe(false); if (test__start_subtest("kretprobe")) test_stacktrace_ips_kprobe(true); + if (test__start_subtest("fentry")) + test_stacktrace_ips_trampoline(false); + if (test__start_subtest("fexit")) + test_stacktrace_ips_trampoline(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index cae077a4061b..6830f2978613 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -53,4 +53,24 @@ int rawtp_test(void *ctx) return 0; } +SEC("fentry/bpf_testmod_stacktrace_test") +int fentry_test(struct pt_regs *ctx) +{ + /* + * Skip 2 bpf_program/trampoline stack entries: + * - bpf_prog_bd1f7a949f55fb03_fentry_test + * - bpf_trampoline_182536277701 + */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + +SEC("fexit/bpf_testmod_stacktrace_test") +int fexit_test(struct pt_regs *ctx) +{ + /* Skip 2 bpf_program/trampoline stack entries, check fentry_test. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4173b494d93a8057d3ed23e65853cd76b647f870 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:37 +0100 Subject: selftests/bpf: Allow to benchmark trigger with stacktrace Adding support to call bpf_get_stackid helper from trigger programs, so far added for kprobe multi. Adding the --stacktrace/-g option to enable it. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-7-jolsa@kernel.org --- tools/testing/selftests/bpf/bench.c | 4 ++ tools/testing/selftests/bpf/bench.h | 1 + tools/testing/selftests/bpf/benchs/bench_trigger.c | 1 + tools/testing/selftests/bpf/progs/trigger_bench.c | 46 +++++++++++++++++----- 4 files changed, 43 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index bd29bb2e6cb5..8368bd3a0665 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -265,6 +265,7 @@ static const struct argp_option opts[] = { { "verbose", 'v', NULL, 0, "Verbose debug output"}, { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, { "quiet", 'q', NULL, 0, "Be more quiet"}, + { "stacktrace", 's', NULL, 0, "Get stack trace"}, { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, "Set of CPUs for producer threads; implies --affinity"}, { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, @@ -350,6 +351,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'q': env.quiet = true; break; + case 's': + env.stacktrace = true; + break; case ARG_PROD_AFFINITY_SET: env.affinity = true; if (parse_num_list(arg, &env.prod_cpus.cpus, diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index bea323820ffb..7cf21936e7ed 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -26,6 +26,7 @@ struct env { bool list; bool affinity; bool quiet; + bool stacktrace; int consumer_cnt; int producer_cnt; int nr_cpus; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 34018fc3927f..aeec9edd3851 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -146,6 +146,7 @@ static void setup_ctx(void) bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); ctx.skel->rodata->batch_iters = args.batch_iters; + ctx.skel->rodata->stacktrace = env.stacktrace; } static void load_ctx(void) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2898b3749d07..4ea0422d1042 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,6 +25,34 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } +volatile const int stacktrace; + +typedef __u64 stack_trace_t[128]; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, stack_trace_t); +} stack_heap SEC(".maps"); + +static __always_inline void do_stacktrace(void *ctx) +{ + if (!stacktrace) + return; + + __u64 *ptr = bpf_map_lookup_elem(&stack_heap, &(__u32){0}); + + if (ptr) + bpf_get_stack(ctx, ptr, sizeof(stack_trace_t), 0); +} + +static __always_inline void handle(void *ctx) +{ + inc_counter(); + do_stacktrace(ctx); +} + SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { @@ -81,21 +109,21 @@ int trigger_driver_kfunc(void *ctx) SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -108,7 +136,7 @@ int bench_kprobe_multi_empty(void *ctx) SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -121,34 +149,34 @@ int bench_kretprobe_multi_empty(void *ctx) SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fmod_ret/bpf_modify_return_test_tp") int bench_trigger_fmodret(void *ctx) { - inc_counter(); + handle(ctx); return -22; } SEC("?tp/bpf_test_run/bpf_trigger_tp") int bench_trigger_tp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?raw_tp/bpf_trigger_tp") int bench_trigger_rawtp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } -- cgit v1.2.3 From 3a4d8bed0b47543b2dfce0b1d714b40d68ff2f7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:52 +0800 Subject: selftests: ublk: derive TID automatically from script name Add automatic TID derivation in test_common.sh based on the script filename. The TID is extracted by stripping the "test_" prefix and ".sh" suffix from the script name (e.g., test_loop_01.sh -> loop_01). This removes the need for each test script to manually define TID, reducing boilerplate and preventing potential mismatches between the script name and TID. Scripts can still override TID after sourcing test_common.sh if needed. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_batch_01.sh | 1 - tools/testing/selftests/ublk/test_batch_02.sh | 1 - tools/testing/selftests/ublk/test_batch_03.sh | 1 - tools/testing/selftests/ublk/test_common.sh | 5 +++++ tools/testing/selftests/ublk/test_generic_01.sh | 1 - tools/testing/selftests/ublk/test_generic_02.sh | 1 - tools/testing/selftests/ublk/test_generic_03.sh | 1 - tools/testing/selftests/ublk/test_generic_04.sh | 1 - tools/testing/selftests/ublk/test_generic_05.sh | 1 - tools/testing/selftests/ublk/test_generic_06.sh | 1 - tools/testing/selftests/ublk/test_generic_07.sh | 1 - tools/testing/selftests/ublk/test_generic_08.sh | 1 - tools/testing/selftests/ublk/test_generic_09.sh | 1 - tools/testing/selftests/ublk/test_generic_10.sh | 1 - tools/testing/selftests/ublk/test_generic_11.sh | 1 - tools/testing/selftests/ublk/test_generic_12.sh | 1 - tools/testing/selftests/ublk/test_generic_13.sh | 1 - tools/testing/selftests/ublk/test_generic_14.sh | 1 - tools/testing/selftests/ublk/test_generic_15.sh | 1 - tools/testing/selftests/ublk/test_generic_16.sh | 1 - tools/testing/selftests/ublk/test_loop_01.sh | 1 - tools/testing/selftests/ublk/test_loop_02.sh | 1 - tools/testing/selftests/ublk/test_loop_03.sh | 1 - tools/testing/selftests/ublk/test_loop_04.sh | 1 - tools/testing/selftests/ublk/test_loop_05.sh | 1 - tools/testing/selftests/ublk/test_loop_06.sh | 1 - tools/testing/selftests/ublk/test_loop_07.sh | 1 - tools/testing/selftests/ublk/test_loop_08.sh | 1 - tools/testing/selftests/ublk/test_null_01.sh | 1 - tools/testing/selftests/ublk/test_null_02.sh | 1 - tools/testing/selftests/ublk/test_null_03.sh | 1 - tools/testing/selftests/ublk/test_null_04.sh | 1 - tools/testing/selftests/ublk/test_stress_01.sh | 1 - tools/testing/selftests/ublk/test_stress_02.sh | 1 - tools/testing/selftests/ublk/test_stress_03.sh | 1 - tools/testing/selftests/ublk/test_stress_04.sh | 1 - tools/testing/selftests/ublk/test_stress_05.sh | 1 - tools/testing/selftests/ublk/test_stress_06.sh | 1 - tools/testing/selftests/ublk/test_stress_07.sh | 1 - tools/testing/selftests/ublk/test_stress_08.sh | 1 - tools/testing/selftests/ublk/test_stress_09.sh | 1 - tools/testing/selftests/ublk/test_stripe_01.sh | 1 - tools/testing/selftests/ublk/test_stripe_02.sh | 1 - tools/testing/selftests/ublk/test_stripe_03.sh | 1 - tools/testing/selftests/ublk/test_stripe_04.sh | 1 - tools/testing/selftests/ublk/test_stripe_05.sh | 1 - tools/testing/selftests/ublk/test_stripe_06.sh | 1 - 47 files changed, 5 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh index 9fa9fff5c62f..a18fb39af8be 100755 --- a/tools/testing/selftests/ublk/test_batch_01.sh +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_01" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh index b477f91359e1..7ca384d11987 100755 --- a/tools/testing/selftests/ublk/test_batch_02.sh +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_02" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh index 13a2b3d3a1b9..aca9cf144b55 100755 --- a/tools/testing/selftests/ublk/test_batch_03.sh +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_03" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 7ff6ce79d62c..bbe031c94a29 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Derive TID from script name: test__.sh -> _ +# Can be overridden in test script after sourcing this file +TID=$(basename "$0" .sh) +TID=${TID#test_} + UBLK_SKIP_CODE=4 _have_program() { diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh index 21a31cd5491a..26cf3c7ceeb5 100755 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_01" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 12920768b1a0..1d4b1d6e059c 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_02" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh index b551aa76cb0d..8934ea926762 100755 --- a/tools/testing/selftests/ublk/test_generic_03.sh +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_03" ERR_CODE=0 _prep_test "null" "check dma & segment limits for zero copy" diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index be2292822bbe..2672f9c40fa8 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 9b7f71c16d82..bda5064bc31f 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_05" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh index fd42062b7b76..14a05054fcd8 100755 --- a/tools/testing/selftests/ublk/test_generic_06.sh +++ b/tools/testing/selftests/ublk/test_generic_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_06" ERR_CODE=0 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server" diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh index cba86451fa5e..8dcfd8978f50 100755 --- a/tools/testing/selftests/ublk/test_generic_07.sh +++ b/tools/testing/selftests/ublk/test_generic_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_07" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh index b222f3a77e12..ce88c31d6b9c 100755 --- a/tools/testing/selftests/ublk/test_generic_08.sh +++ b/tools/testing/selftests/ublk/test_generic_08.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_08" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh index bb6f77ca5522..744d0cdaa242 100755 --- a/tools/testing/selftests/ublk/test_generic_09.sh +++ b/tools/testing/selftests/ublk/test_generic_09.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_09" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh index abc11c3d416b..4b4293b9081f 100755 --- a/tools/testing/selftests/ublk/test_generic_10.sh +++ b/tools/testing/selftests/ublk/test_generic_10.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_10" ERR_CODE=0 if ! _have_feature "UPDATE_SIZE"; then diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh index d1f973c8c645..e0dc0b8fe5d6 100755 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ b/tools/testing/selftests/ublk/test_generic_11.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_11" ERR_CODE=0 ublk_run_quiesce_recover() diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index b4046201b4d9..54b81ddfe9f9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_12" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh index b7aa90b1cb74..922115aa14f4 100755 --- a/tools/testing/selftests/ublk/test_generic_13.sh +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_13" ERR_CODE=0 _prep_test "null" "check that feature list is complete" diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh index cd9b44b97c24..178443394ca5 100755 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ b/tools/testing/selftests/ublk/test_generic_14.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_14" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh index 76379362e0a2..727d0f4610d6 100755 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ b/tools/testing/selftests/ublk/test_generic_15.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_15" ERR_CODE=0 _test_partition_scan_no_hang() diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index e08af7b685c9..42e8d2e16ec9 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_16" ERR_CODE=0 _prep_test "null" "stop --safe command" diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 833fa0dbc700..338a235fd82a 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 874568b3646b..04c52454e2ec 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_02" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index c30f797c6429..6e8f649fe93d 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index b01d75b3214d..9f6774ec0de6 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_04" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh index de2141533074..2b8d99e007be 100755 --- a/tools/testing/selftests/ublk/test_loop_05.sh +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh index 1d1a8a725502..e73f6f4844db 100755 --- a/tools/testing/selftests/ublk/test_loop_06.sh +++ b/tools/testing/selftests/ublk/test_loop_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_06" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh index 493f3fb611a5..264d20e7c530 100755 --- a/tools/testing/selftests/ublk/test_loop_07.sh +++ b/tools/testing/selftests/ublk/test_loop_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_07" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with user copy" diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index ca289cfb2ad4..2caa7ba748fb 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,7 +13,6 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi -TID=loop_08 _prep_test "loop" "end-to-end integrity" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index c2cb8f7a09fe..eebce8076530 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 8accd35beb55..654bdff39664 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh index 0051067b4686..29cd09f06672 100755 --- a/tools/testing/selftests/ublk/test_null_03.sh +++ b/tools/testing/selftests/ublk/test_null_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 0b0719ea33a3..7491b8c17f00 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID=null_04 _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7d3150f057d4..a9322ce496e9 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_01" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 4bdd921081e5..6c114194f9c9 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 3ed4c9b2d8c0..4e81ca0db758 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_03" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index efa8dc33234b..6c6f44b172bc 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_04" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 68a194144302..7e9324de2030 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh index 37188ec2e1f7..c72e5d0b14be 100755 --- a/tools/testing/selftests/ublk/test_stress_06.sh +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh index fb061fc26d36..04c2764d5238 100755 --- a/tools/testing/selftests/ublk/test_stress_07.sh +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 9abb50ee3d00..37f7d204879a 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 87b92b0a2410..53c1e3b2ab30 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index 4e4f0fdf3c9b..3bc821aadad8 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh index 5820ab2efba4..4a7d2b21a6bf 100755 --- a/tools/testing/selftests/ublk/test_stripe_02.sh +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_02" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh index 20b977e27814..a1c159d54e53 100755 --- a/tools/testing/selftests/ublk/test_stripe_03.sh +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh index 1b51ed2f1d84..0c30bd6c2b3b 100755 --- a/tools/testing/selftests/ublk/test_stripe_04.sh +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_04" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on zero copy" diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh index 05d71951d710..6ddfa88ad226 100755 --- a/tools/testing/selftests/ublk/test_stripe_05.sh +++ b/tools/testing/selftests/ublk/test_stripe_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh index d06cac7626e2..a2c7bf4cc613 100755 --- a/tools/testing/selftests/ublk/test_stripe_06.sh +++ b/tools/testing/selftests/ublk/test_stripe_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_06" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on user copy" -- cgit v1.2.3 From e07a2039b6d4ae3acf8ae39b86be449b7fa18d4a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:53 +0800 Subject: selftests: ublk: add selftest for UBLK_F_NO_AUTO_PART_SCAN Add test_part_01.sh to test the UBLK_F_NO_AUTO_PART_SCAN feature flag which allows suppressing automatic partition scanning during device startup while still allowing manual partition probing. The test verifies: - Normal behavior: partitions are auto-detected without the flag - With flag: partitions are not auto-detected during START_DEV - Manual scan: blockdev --rereadpt works with the flag Also update kublk tool to support --no_auto_part_scan option and recognize the feature flag. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/kublk.c | 6 +- tools/testing/selftests/ublk/kublk.h | 3 +- tools/testing/selftests/ublk/test_part_01.sh | 104 +++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_part_01.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index e39a6f871fcc..bc5bd7d1381d 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -48,6 +48,8 @@ TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh +TEST_PROGS += test_part_01.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2da37557e1a9..e8279c4acc40 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1615,6 +1615,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_INTEGRITY), FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), + FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), }; struct ublk_dev *dev; __u64 features = 0; @@ -1712,7 +1713,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); - printf("\t[--batch|-b]\n"); + printf("\t[--batch|-b] [--no_auto_part_scan]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1786,6 +1787,7 @@ int main(int argc, char *argv[]) { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, + { "no_auto_part_scan", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1898,6 +1900,8 @@ int main(int argc, char *argv[]) ctx.tag_size = strtoul(optarg, NULL, 0); if (!strcmp(longopts[option_idx].name, "safe")) ctx.safe_stop = 1; + if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) + ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index ca97deb5e208..1faeccaaecae 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,12 +78,13 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + unsigned int safe_stop:1; + unsigned int no_auto_part_scan:1; __u32 integrity_flags; __u8 metadata_size; __u8 pi_offset; __u8 csum_type; __u8 tag_size; - unsigned int safe_stop:1; int _evtfd; int _shmid; diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh new file mode 100755 index 000000000000..8028f6e4b3a5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_01.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +format_backing_file() +{ + local backing_file=$1 + + # Create ublk device to write partition table + local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + # Write partition table with sfdisk + sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 < /dev/null 2>&1 + udevadm settle + + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +if ! _have_program sfdisk || ! _have_program blockdev; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN" + +if ! _have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then + _cleanup_test "generic" + exit "$UBLK_SKIP_CODE" +fi + + +# Create and format backing file with partition table +_create_backfile 0 256M +format_backing_file "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test normal auto partition scan +[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test no auto partition scan with manual scan +[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +_cleanup_test "generic" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 7a30d3dfea4a455d1109d5258fe332f2157071ba Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:54 +0800 Subject: selftests: ublk: rename test_generic_15 to test_part_02 This test exercises partition scanning behavior, so move it to the test_part_* group for consistency. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 +- tools/testing/selftests/ublk/test_generic_15.sh | 67 ------------------------- tools/testing/selftests/ublk/test_part_02.sh | 67 +++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 68 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_15.sh create mode 100755 tools/testing/selftests/ublk/test_part_02.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index bc5bd7d1381d..ca8588ed962c 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -22,7 +22,6 @@ TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh -TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -49,6 +48,7 @@ TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh TEST_PROGS += test_part_01.sh +TEST_PROGS += test_part_02.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh deleted file mode 100755 index 727d0f4610d6..000000000000 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -_test_partition_scan_no_hang() -{ - local recovery_flag=$1 - local expected_state=$2 - local dev_id - local state - local daemon_pid - local start_time - local elapsed - - # Create ublk device with fault_inject target and very large delay - # to simulate hang during partition table read - # --delay_us 60000000 = 60 seconds delay - # Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting - # for partition scan events to complete - if [ "$recovery_flag" = "yes" ]; then - echo "Testing partition scan with recovery support..." - dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1) - else - echo "Testing partition scan without recovery..." - dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000) - fi - - _check_add_dev "$TID" $? - - # The add command should return quickly because partition scan is async. - # Now sleep briefly to let the async partition scan work start and hit - # the delay in the fault_inject handler. - sleep 1 - - # Kill the ublk daemon while partition scan is potentially blocked - # And check state transitions properly - start_time=${SECONDS} - daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") - state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}") - elapsed=$((SECONDS - start_time)) - - # Verify the device transitioned to expected state - if [ "$state" != "${expected_state}" ]; then - echo "FAIL: Device state is $state, expected ${expected_state}" - ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 - return - fi - echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" - - # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 -} - -_prep_test "partition_scan" "verify async partition scan prevents IO hang" - -# Test 1: Without recovery support - should transition to DEAD -_test_partition_scan_no_hang "no" "DEAD" - -# Test 2: With recovery support - should transition to QUIESCED -_test_partition_scan_no_hang "yes" "QUIESCED" - -_cleanup_test "partition_scan" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh new file mode 100755 index 000000000000..727d0f4610d6 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_test_partition_scan_no_hang() +{ + local recovery_flag=$1 + local expected_state=$2 + local dev_id + local state + local daemon_pid + local start_time + local elapsed + + # Create ublk device with fault_inject target and very large delay + # to simulate hang during partition table read + # --delay_us 60000000 = 60 seconds delay + # Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting + # for partition scan events to complete + if [ "$recovery_flag" = "yes" ]; then + echo "Testing partition scan with recovery support..." + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1) + else + echo "Testing partition scan without recovery..." + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000) + fi + + _check_add_dev "$TID" $? + + # The add command should return quickly because partition scan is async. + # Now sleep briefly to let the async partition scan work start and hit + # the delay in the fault_inject handler. + sleep 1 + + # Kill the ublk daemon while partition scan is potentially blocked + # And check state transitions properly + start_time=${SECONDS} + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") + state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}") + elapsed=$((SECONDS - start_time)) + + # Verify the device transitioned to expected state + if [ "$state" != "${expected_state}" ]; then + echo "FAIL: Device state is $state, expected ${expected_state}" + ERR_CODE=255 + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + return + fi + echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" + + # Clean up the device + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +} + +_prep_test "partition_scan" "verify async partition scan prevents IO hang" + +# Test 1: Without recovery support - should transition to DEAD +_test_partition_scan_no_hang "no" "DEAD" + +# Test 2: With recovery support - should transition to QUIESCED +_test_partition_scan_no_hang "yes" "QUIESCED" + +_cleanup_test "partition_scan" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 130975353b1548d76aa9790a4ac7e74bd2a37221 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:55 +0800 Subject: selftests: ublk: refactor test_null_04 into separate functions Encapsulate each test case in its own function that creates the device, runs checks, and deletes only that device. This avoids calling _cleanup_test multiple times. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 248 ++++++++++----------------- 1 file changed, 94 insertions(+), 154 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 7491b8c17f00..22328e0f3925 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,163 +3,103 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 -_prep_test "null" "integrity params" +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 -dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 0 ]; then - echo "pi_tuple_size $pi_tuple_size != 0" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != nop ]; then - echo "format $format != nop" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} -dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 64 ]; then - echo "metadata_size $metadata_size != 64" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 56 ]; then - echo "pi_offset $pi_offset != 56" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 1 ]; then - echo "device_is_integrity_capable $capable != 1" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE3-IP ]; then - echo "format $format != T10-DIF-TYPE3-IP" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test +_test_metadata_only() { + local dev_id -dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE1-CRC ]; then - echo "format $format != T10-DIF-TYPE1-CRC" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test + dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + _check_add_dev "$TID" $? -dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 16 ]; then - echo "metadata_size $metadata_size != 16" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 16 ]; then - echo "pi_tuple_size $pi_tuple_size != 16" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then - echo "format $format != EXT-DIF-TYPE3-CRC64" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 8 ]; then - echo "tag_size $tag_size != 8" - _show_result $TID 255 -fi -_cleanup_test + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id -_show_result $TID 0 + dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE -- cgit v1.2.3 From 76334de7da404c385e18efb3640ed60ca77a899f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:56 +0800 Subject: selftests: ublk: disable partition scan for integrity tests The null target doesn't handle IO, so disable partition scan to avoid IO failures caused by integrity verification during the kernel's partition table read. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 22328e0f3925..a5599d38583a 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -21,7 +21,7 @@ _check_value() { _test_metadata_only() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -40,7 +40,7 @@ _test_metadata_only() { _test_integrity_capable_ip() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && @@ -59,7 +59,7 @@ _test_integrity_capable_ip() { _test_integrity_reftag_t10dif() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -78,7 +78,7 @@ _test_integrity_reftag_t10dif() { _test_nvme_csum() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && -- cgit v1.2.3 From 4e0d293af9e37c735aec574c1e69ed71f81f94b2 Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:57 +0800 Subject: selftests: ublk: mark each test start and end time in dmesg Log test start and end time in dmesg, so generated log messages during the test run can be linked to specific test from the test suite. (switch to `date +%F %T`) Signed-off-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bbe031c94a29..dd4eff97610a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -126,6 +126,7 @@ _prep_test() { modprobe ublk_drv > /dev/null 2>&1 UBLK_TMP=$(mktemp ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" + echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } _remove_test_files() @@ -170,6 +171,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } _have_feature() -- cgit v1.2.3 From 2feca79ef8df5505b87c00812b9ba263b92c64ed Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:58 +0800 Subject: selftests: ublk: move test temp files into a sub directory Create and use a temporary directory for the files created during test runs. If TMPDIR environment variable is set use it as a base for the temporary directory path. TMPDIR=/mnt/scratch make run_tests and TMPDIR=/mnt/scratch ./test_generic_01.sh will place test directory under /mnt/scratch Signed-off-by: Alexander Atanasov Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index dd4eff97610a..21ba51fcc7d7 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -48,7 +48,7 @@ _create_backfile() { old_file="${UBLK_BACKFILES[$index]}" [ -f "$old_file" ] && rm -f "$old_file" - new_file=$(mktemp ublk_file_"${new_size}"_XXXXX) + new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX) truncate -s "${new_size}" "${new_file}" UBLK_BACKFILES["$index"]="$new_file" } @@ -65,7 +65,7 @@ _remove_files() { _create_tmp_dir() { local my_file; - my_file=$(mktemp -d ublk_dir_XXXXX) + my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX) echo "$my_file" } @@ -124,7 +124,9 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TMP=$(mktemp ublk_test_XXXXX) + TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) + export UBLK_TEST_DIR=${TDIR} + UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } @@ -171,6 +173,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + rmdir ${UBLK_TEST_DIR} echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } @@ -405,6 +408,8 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() +UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT +export UBLK_TEST_DIR -- cgit v1.2.3 From f0b5b3d6b56f8717e255406366d81bbcd3631660 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 31 Jan 2026 17:09:02 +0100 Subject: selftests/bpf: Test access from RO map from xdp_store_bytes This new test simply checks that helper bpf_xdp_store_bytes can successfully read from a read-only map. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/4fdb934a713b2d7cf133288c77f6cfefe9856440.1769875479.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_xdp.c | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp.c b/tools/testing/selftests/bpf/progs/verifier_xdp.c index 50768ed179b3..7dc9226aeb34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp.c @@ -5,6 +5,14 @@ #include #include "bpf_misc.h" +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, __u64); + __uint(map_flags, BPF_F_RDONLY_PROG); +} map_array_ro SEC(".maps"); + SEC("xdp") __description("XDP, using ifindex from netdev") __success __retval(1) @@ -21,4 +29,31 @@ l0_%=: exit; \ : __clobber_all); } +SEC("xdp") +__description("XDP, using xdp_store_bytes from RO map") +__success __retval(0) +__naked void xdp_store_bytes_from_ro_map(void) +{ + asm volatile (" \ + r6 = r1; \ + r1 = 0; \ + *(u64*)(r10 - 8) = r1; \ + r2 = r10; \ + r2 += -8; \ + r1 = %[map_array_ro] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + r1 = r6; \ + r2 = 0; \ + r3 = r0; \ + r4 = 8; \ + call %[bpf_xdp_store_bytes]; \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm(bpf_xdp_store_bytes), + __imm_addr(map_array_ro) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on those architectures, that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures. Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 5 ++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 5 ++++ kernel/bpf/verifier.c | 5 ++++ .../selftests/bpf/prog_tests/fsession_test.c | 32 ++++++++++++++++------ 5 files changed, 40 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5a075e06cf45..070ba80e39d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ebece600aeb..dc906dfdff94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 256cc5c1a7df..6b62b6d57175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 0c4b428e1cee..a299aeb8cc2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -29,8 +29,16 @@ static void test_fsession_basic(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; err = fsession_test__attach(skel); @@ -47,8 +55,16 @@ static void test_fsession_reattach(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; /* first attach */ @@ -94,6 +110,10 @@ static void test_fsession_cookie(void) bpf_program__set_autoload(skel->progs.test6, false); err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; @@ -111,10 +131,6 @@ cleanup: void test_fsession_test(void) { -#if !defined(__x86_64__) - test__skip(); - return; -#endif if (test__start_subtest("fsession_test")) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) -- cgit v1.2.3 From 7f10da2133b18b0f1bc02d58671883537e212279 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:50 +0800 Subject: selftests/bpf: Enable get_func_args and get_func_ip tests on arm64 Allow get_func_args, and get_func_ip fsession selftests to run on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/get_func_args_test.c | 2 +- tools/testing/selftests/bpf/progs/get_func_ip_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 0a3236a7a109..180ba5098ca1 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -167,7 +167,7 @@ int BPF_PROG(tp_test2) } __u64 test7_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test7) { diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 65f7e1f182bf..43ff836a8ed8 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret) __u64 test9_entry_result = 0; __u64 test9_exit_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test9, int a) { -- cgit v1.2.3 From 5af302a15a1d628a025a78892001fe8afea90c60 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:32 +0800 Subject: selftests: ublk: simplify UBLK_TEST_DIR handling Remove intermediate TDIR variable and set UBLK_TEST_DIR directly in _prep_test(). Remove default initialization since the directory is created dynamically when tests run. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 21ba51fcc7d7..8d298a7ee7b1 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -124,8 +124,7 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) - export UBLK_TEST_DIR=${TDIR} + UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg @@ -408,8 +407,6 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() -UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT -export UBLK_TEST_DIR -- cgit v1.2.3 From 842b6520e579b8bd7d6ea09937e1fb7729cce1c5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:33 +0800 Subject: selftests: ublk: refactor test_loop_08 into separate functions Encapsulate each test case in its own function for better organization and maintainability: - _setup_device(): device and backfile initialization - _test_fill_and_verify(): initial data population - _test_corrupted_reftag(): reftag corruption detection test - _test_corrupted_data(): data corruption detection test - _test_bad_apptag(): apptag mismatch detection test Also fix temp file creation to use ${UBLK_TEST_DIR}/fio_err_XXXXX instead of creating in current directory. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_loop_08.sh | 199 ++++++++++++++++----------- 1 file changed, 115 insertions(+), 84 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index 2caa7ba748fb..aaf1f52da559 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,98 +13,129 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi +ERR_CODE=0 -_prep_test "loop" "end-to-end integrity" +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" -_create_backfile 0 256M -_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) -integrity_params="--integrity_capable --integrity_reftag - --metadata_size 64 --pi_offset 56 --csum_type t10dif" -dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") -_check_add_dev $TID $? - -# 1M * (64 integrity bytes / 512 data bytes) = 128K -fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 - --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG - --filename /dev/ublkb$dev_id" -fio --name fill --rw randwrite $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio fill failed" - _show_result $TID $err -fi +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) -fio --name verify --rw randread $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio verify failed" - _show_result $TID $err -fi + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? -fio_err=$(mktemp fio_err_XXXXX) + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" -# Overwrite 4-byte reftag at offset 56 + 4 = 60 -dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" -dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_reftag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_reftag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi -# Reset to 0 -dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? -if [ $err != 0 ]; then - echo "dd restore corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} -dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" -dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_data failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_data unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_data message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi -if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then - echo "fio bad_apptag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio bad_apptag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag rm -f "$fio_err" _cleanup_test -_show_result $TID 0 +_show_result "$TID" $ERR_CODE -- cgit v1.2.3 From 92734a4f3a7a5449b0c7d0160ba658a2b665c31b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:34 +0800 Subject: selftests: ublk: add _ublk_del_dev helper function Add _ublk_del_dev() to delete a specific ublk device by ID and use it in all test scripts instead of calling UBLK_PROG directly. Also remove unused _remove_ublk_devices() function. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 +++++++------ tools/testing/selftests/ublk/test_generic_16.sh | 4 ++-- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- tools/testing/selftests/ublk/test_part_02.sh | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 8d298a7ee7b1..0f1fdb0892b4 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -106,11 +106,6 @@ _check_root() { fi } -_remove_ublk_devices() { - ${UBLK_PROG} del -a - modprobe -r ublk_drv > /dev/null 2>&1 -} - _get_ublk_dev_state() { ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' } @@ -277,10 +272,16 @@ __ublk_kill_daemon() echo "$state" } -__remove_ublk_dev_return() { +_ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + _ublk_del_dev "${dev_id}" local res=$? udevadm settle return ${res} diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index 42e8d2e16ec9..3ef367836ac5 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -24,7 +24,7 @@ if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then fi # Clean up device -${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +_ublk_del_dev "${dev_id}" > /dev/null 2>&1 udevadm settle # Test 2: stop --safe on device with active opener should fail @@ -49,7 +49,7 @@ kill $dd_pid 2>/dev/null wait $dd_pid 2>/dev/null # Now device should be idle, regular delete should work -${UBLK_PROG} del -n "${dev_id}" +_ublk_del_dev "${dev_id}" udevadm settle _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index a5599d38583a..6713b280a6ff 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -34,7 +34,7 @@ _test_metadata_only() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_capable_ip() { @@ -53,7 +53,7 @@ _test_integrity_capable_ip() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_reftag_t10dif() { @@ -72,7 +72,7 @@ _test_integrity_reftag_t10dif() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_nvme_csum() { @@ -91,7 +91,7 @@ _test_nvme_csum() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh index 727d0f4610d6..acd098deda3a 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -46,13 +46,13 @@ _test_partition_scan_no_hang() if [ "$state" != "${expected_state}" ]; then echo "FAIL: Device state is $state, expected ${expected_state}" ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 return fi echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 } _prep_test "partition_scan" "verify async partition scan prevents IO hang" -- cgit v1.2.3 From 2021e6109de3e97adfce262c40a657ff206ef495 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:35 +0800 Subject: selftests: ublk: track created devices for per-test cleanup Track device IDs in UBLK_DEVS array when created. Update _cleanup_test() to only delete devices created by this test instead of using 'del -a' which removes all devices. This prepares for running tests concurrently where each test should only clean up its own devices. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 0f1fdb0892b4..422882c32490 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -164,7 +164,12 @@ _check_add_dev() } _cleanup_test() { - "${UBLK_PROG}" del -a + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + while read -r dev_id; do + ${UBLK_PROG} del -n "${dev_id}" + done < "${UBLK_TEST_DIR}/.ublk_devs" + rm -f "${UBLK_TEST_DIR}/.ublk_devs" + fi _remove_files rmdir ${UBLK_TEST_DIR} @@ -205,6 +210,7 @@ _create_ublk_dev() { fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then + echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs" echo "${dev_id}" else return 255 @@ -276,6 +282,11 @@ _ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" + + # Remove from tracking file + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs" + fi } __remove_ublk_dev_return() { -- cgit v1.2.3 From b6bbc3bec19efd557f888d78865b627b80b37a32 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:36 +0800 Subject: selftests: ublk: add group-based test targets Add convenient Makefile targets for running specific test groups: - run_generic, run_batch, run_null, run_loop, run_stripe, run_stress, etc. - run_all for running all tests Test groups are auto-detected from TEST_PROGS using pattern matching (test__.sh -> group), and targets are generated dynamically using define/eval templates. Supports parallel execution via JOBS variable: - JOBS=1 (default): sequential with kselftest TAP output - JOBS>1: parallel execution with xargs -P Usage examples: make run_null # Sequential execution make run_stress JOBS=4 # Parallel with 4 jobs make run_all JOBS=8 # Run all tests with 8 parallel jobs With JOBS=8, running time of `make run_all` is reduced to 2m2s from 6m5s in my test VM. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index ca8588ed962c..37e012d3a8a7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -72,3 +72,39 @@ $(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh + +# Test groups for running subsets of tests +# JOBS=1 (default): sequential with kselftest TAP output +# JOBS>1: parallel execution with xargs -P +# Usage: make run_null JOBS=4 +JOBS ?= 1 + +# Auto-detect test groups from TEST_PROGS (test__.sh -> group) +TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ + sed 's/test_\([^_]*\)_.*/\1/' | sort -u) + +# Template for group test targets +# $(1) = group name (e.g., null, generic, stress) +define RUN_GROUP +run_$(1): all + @if [ $$(JOBS) -gt 1 ]; then \ + echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \ + xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \ + else \ + $$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \ + fi +.PHONY: run_$(1) +endef + +# Generate targets for each discovered test group +$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group)))) + +# Run all tests (parallel when JOBS>1) +run_all: all + @if [ $(JOBS) -gt 1 ]; then \ + echo $(TEST_PROGS) | tr ' ' '\n' | \ + xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \ + else \ + $(call RUN_TESTS, $(TEST_PROGS)); \ + fi +.PHONY: run_all -- cgit v1.2.3 From 64406dd2f69fe27921c7bf06088871c002cf6186 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:37 +0800 Subject: selftests: ublk: add _ublk_sleep helper for parallel execution Add _ublk_sleep() helper function that uses different sleep times depending on whether tests run in parallel or sequential mode. Usage: _ublk_sleep Export JOBS variable from Makefile so test scripts can detect parallel execution, and use _ublk_sleep in test_part_02.sh to handle the partition scan delay (1s normal, 5s parallel). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++++++++++ tools/testing/selftests/ublk/test_part_02.sh | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 37e012d3a8a7..1ceae611acb7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -78,6 +78,7 @@ check: # JOBS>1: parallel execution with xargs -P # Usage: make run_null JOBS=4 JOBS ?= 1 +export JOBS # Auto-detect test groups from TEST_PROGS (test__.sh -> group) TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 422882c32490..bd27a6875c1a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -15,6 +15,16 @@ _have_program() { return 1 } +# Sleep with awareness of parallel execution. +# Usage: _ublk_sleep +_ublk_sleep() { + if [ "${JOBS:-1}" -gt 1 ]; then + sleep "$2" + else + sleep "$1" + fi +} + _get_disk_dev_t() { local dev_id=$1 local dev diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh index acd098deda3a..7d42ab4d6e83 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -33,7 +33,7 @@ _test_partition_scan_no_hang() # The add command should return quickly because partition scan is async. # Now sleep briefly to let the async partition scan work start and hit # the delay in the fault_inject handler. - sleep 1 + _ublk_sleep 1 5 # Kill the ublk daemon while partition scan is potentially blocked # And check state transitions properly -- cgit v1.2.3 From 56a08b87f9f2a763cb5546f83b78ebe1e96260af Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:38 +0800 Subject: selftests: ublk: increase timeouts for parallel test execution When running tests in parallel with high JOBS count (e.g., JOBS=64), the existing timeouts can be insufficient due to system load: - Increase state wait loops from 20/50 to 100 iterations in _recover_ublk_dev(), __ublk_quiesce_dev(), and __ublk_kill_daemon() to handle slower state transitions under heavy load - Add --timeout=20 to udevadm settle calls to prevent indefinite hangs when udev event queue is overwhelmed by rapid device creation/deletion Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bd27a6875c1a..c3afd00783a2 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -216,7 +216,7 @@ _create_ublk_dev() { fi if [ "$settle" = "yes" ]; then - udevadm settle + udevadm settle --timeout=20 fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then @@ -240,7 +240,7 @@ _recover_ublk_dev() { local state dev_id=$(_create_ublk_dev "recover" "yes" "$@") - for ((j=0;j<20;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "LIVE" ] && break sleep 1 @@ -260,7 +260,7 @@ __ublk_quiesce_dev() return "$state" fi - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "$exp_state" ] && break sleep 1 @@ -279,7 +279,7 @@ __ublk_kill_daemon() daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") state=$(_get_ublk_dev_state "${dev_id}") - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do [ "$state" == "$exp_state" ] && break kill -9 "$daemon_pid" > /dev/null 2>&1 sleep 1 @@ -304,7 +304,7 @@ __remove_ublk_dev_return() { _ublk_del_dev "${dev_id}" local res=$? - udevadm settle + udevadm settle --timeout=20 return ${res} } -- cgit v1.2.3 From d9a36ab302b1c90d8f03a3b13538b8676eb6ed3b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:39 +0800 Subject: selftests: ublk: reorganize tests into integrity and recover groups Move integrity-focused tests into new 'integrity' group: - test_null_04.sh -> test_integrity_01.sh - test_loop_08.sh -> test_integrity_02.sh Move recovery-focused tests into new 'recover' group: - test_generic_04.sh -> test_recover_01.sh - test_generic_05.sh -> test_recover_02.sh - test_generic_11.sh -> test_recover_03.sh - test_generic_14.sh -> test_recover_04.sh Update Makefile to reflect the reorganization. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 14 ++- tools/testing/selftests/ublk/test_generic_04.sh | 44 ------- tools/testing/selftests/ublk/test_generic_05.sh | 48 -------- tools/testing/selftests/ublk/test_generic_11.sh | 43 ------- tools/testing/selftests/ublk/test_generic_14.sh | 39 ------ tools/testing/selftests/ublk/test_integrity_01.sh | 105 ++++++++++++++++ tools/testing/selftests/ublk/test_integrity_02.sh | 141 ++++++++++++++++++++++ tools/testing/selftests/ublk/test_loop_08.sh | 141 ---------------------- tools/testing/selftests/ublk/test_null_04.sh | 105 ---------------- tools/testing/selftests/ublk/test_recover_01.sh | 44 +++++++ tools/testing/selftests/ublk/test_recover_02.sh | 48 ++++++++ tools/testing/selftests/ublk/test_recover_03.sh | 43 +++++++ tools/testing/selftests/ublk/test_recover_04.sh | 39 ++++++ 13 files changed, 428 insertions(+), 426 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_04.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_05.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_11.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_14.sh create mode 100755 tools/testing/selftests/ublk/test_integrity_01.sh create mode 100755 tools/testing/selftests/ublk/test_integrity_02.sh delete mode 100755 tools/testing/selftests/ublk/test_loop_08.sh delete mode 100755 tools/testing/selftests/ublk/test_null_04.sh create mode 100755 tools/testing/selftests/ublk/test_recover_01.sh create mode 100755 tools/testing/selftests/ublk/test_recover_02.sh create mode 100755 tools/testing/selftests/ublk/test_recover_03.sh create mode 100755 tools/testing/selftests/ublk/test_recover_04.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 1ceae611acb7..a62a06e13006 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -10,18 +10,14 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh TEST_PROGS += test_generic_02.sh TEST_PROGS += test_generic_03.sh -TEST_PROGS += test_generic_04.sh -TEST_PROGS += test_generic_05.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh TEST_PROGS += test_generic_08.sh TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh -TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh -TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -31,7 +27,6 @@ TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh -TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh @@ -39,7 +34,14 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh -TEST_PROGS += test_loop_08.sh + +TEST_PROGS += test_integrity_01.sh +TEST_PROGS += test_integrity_02.sh + +TEST_PROGS += test_recover_01.sh +TEST_PROGS += test_recover_02.sh +TEST_PROGS += test_recover_03.sh +TEST_PROGS += test_recover_04.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh deleted file mode 100755 index 2672f9c40fa8..000000000000 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -b & -ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 & -ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh deleted file mode 100755 index bda5064bc31f..000000000000 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_feature "ZERO_COPY"; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification (zero copy)" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -z -b & -ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -z & -ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh deleted file mode 100755 index e0dc0b8fe5d6..000000000000 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_quiesce_recover() -{ - run_io_and_recover 256M "quiesce_dev" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_feature "QUIESCE"; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "quiesce" "basic quiesce & recover function verification" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_quiesce_recover -t null -q 2 -r 1 & -ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & -ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 & -ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "quiesce" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh deleted file mode 100755 index 178443394ca5..000000000000 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification (user copy)" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -u & -ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_01.sh b/tools/testing/selftests/ublk/test_integrity_01.sh new file mode 100755 index 000000000000..6713b280a6ff --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_01.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 + + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} + +_test_metadata_only() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + _ublk_del_dev "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_02.sh b/tools/testing/selftests/ublk/test_integrity_02.sh new file mode 100755 index 000000000000..aaf1f52da559 --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_02.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +ERR_CODE=0 + +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" + +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) + + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? + + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" + + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} + +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi + + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag + +rm -f "$fio_err" + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh deleted file mode 100755 index aaf1f52da559..000000000000 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -if ! _have_program fio; then - exit $UBLK_SKIP_CODE -fi - -fio_version=$(fio --version) -if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then - echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" - exit $UBLK_SKIP_CODE -fi - -ERR_CODE=0 - -# Global variables set during device setup -dev_id="" -fio_args="" -fio_err="" - -_setup_device() { - _create_backfile 0 256M - _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) - - local integrity_params="--integrity_capable --integrity_reftag - --metadata_size 64 --pi_offset 56 --csum_type t10dif" - dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") - _check_add_dev "$TID" $? - - # 1M * (64 integrity bytes / 512 data bytes) = 128K - fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 - --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG - --filename /dev/ublkb$dev_id" - - fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) -} - -_test_fill_and_verify() { - fio --name fill --rw randwrite $fio_args > /dev/null - if [ $? != 0 ]; then - echo "fio fill failed" - ERR_CODE=255 - return 1 - fi - - fio --name verify --rw randread $fio_args > /dev/null - if [ $? != 0 ]; then - echo "fio verify failed" - ERR_CODE=255 - return 1 - fi -} - -_test_corrupted_reftag() { - local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" - local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" - - # Overwrite 4-byte reftag at offset 56 + 4 = 60 - dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args - if [ $? != 0 ]; then - echo "dd corrupted_reftag failed" - ERR_CODE=255 - return 1 - fi - - if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_reftag unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_reftag message not found: $expected_err" - ERR_CODE=255 - return 1 - fi - - # Reset to 0 - dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args - if [ $? != 0 ]; then - echo "dd restore corrupted_reftag failed" - ERR_CODE=255 - return 1 - fi -} - -_test_corrupted_data() { - local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" - local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" - - dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args - if [ $? != 0 ]; then - echo "dd corrupted_data failed" - ERR_CODE=255 - return 1 - fi - - if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_data unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_data message not found: $expected_err" - ERR_CODE=255 - return 1 - fi -} - -_test_bad_apptag() { - local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" - - if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then - echo "fio bad_apptag unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio bad_apptag message not found: $expected_err" - ERR_CODE=255 - return 1 - fi -} - -_prep_test "loop" "end-to-end integrity" - -_setup_device - -_test_fill_and_verify && \ -_test_corrupted_reftag && \ -_test_corrupted_data && \ -_test_bad_apptag - -rm -f "$fio_err" - -_cleanup_test -_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh deleted file mode 100755 index 6713b280a6ff..000000000000 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -_check_value() { - local name=$1 - local actual=$2 - local expected=$3 - - if [ "$actual" != "$expected" ]; then - echo "$name $actual != $expected" - ERR_CODE=255 - return 1 - fi - return 0 -} - -_test_metadata_only() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_integrity_capable_ip() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_integrity_reftag_t10dif() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_nvme_csum() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 - - _ublk_del_dev "${dev_id}" -} - -_prep_test "null" "integrity params" - -_test_metadata_only -_test_integrity_capable_ip -_test_integrity_reftag_t10dif -_test_nvme_csum - -_cleanup_test -_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_01.sh b/tools/testing/selftests/ublk/test_recover_01.sh new file mode 100755 index 000000000000..2672f9c40fa8 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_01.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 & +ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_02.sh b/tools/testing/selftests/ublk/test_recover_02.sh new file mode 100755 index 000000000000..bda5064bc31f --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_02.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification (zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -z & +ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_03.sh b/tools/testing/selftests/ublk/test_recover_03.sh new file mode 100755 index 000000000000..e0dc0b8fe5d6 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_03.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_quiesce_recover() +{ + run_io_and_recover 256M "quiesce_dev" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_feature "QUIESCE"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "quiesce" "basic quiesce & recover function verification" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_quiesce_recover -t null -q 2 -r 1 & +ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & +ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 & +ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "quiesce" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_04.sh b/tools/testing/selftests/ublk/test_recover_04.sh new file mode 100755 index 000000000000..178443394ca5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_04.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification (user copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -u & +ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 5314d25afbc44d0449fa2519d2c9d7f3c319f74c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:40 +0800 Subject: selftests: ublk: improve I/O ordering test with bpftrace Remove test_generic_01.sh since block layer may reorder I/O, making the test prone to false positives. Apply the improvements to test_generic_02.sh instead, which supposes for covering ublk dispatch io order. Rework test_generic_02 to verify that ublk dispatch doesn't reorder I/O by comparing request start order with completion order using bpftrace. The bpftrace script now: - Tracks each request's start sequence number in a map keyed by sector - On completion, verifies the request's start order matches expected completion order - Reports any out-of-order completions detected The test script: - Wait bpftrace BEGIN code block is run - Pins fio to CPU 0 for deterministic behavior - Uses block_io_start and block_rq_complete tracepoints - Checks bpftrace output for reordering errors Reported-and-tested-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 3 +- tools/testing/selftests/ublk/test_generic_01.sh | 47 ------------------------- tools/testing/selftests/ublk/test_generic_02.sh | 22 ++++++++---- tools/testing/selftests/ublk/trace/seq_io.bt | 47 ++++++++++++++++++++----- 4 files changed, 54 insertions(+), 65 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_01.sh (limited to 'tools') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index a62a06e13006..8ac2d4a682a1 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -7,8 +7,7 @@ endif LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_generic_01.sh -TEST_PROGS += test_generic_02.sh +TEST_PROGS := test_generic_02.sh TEST_PROGS += test_generic_03.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh deleted file mode 100755 index 26cf3c7ceeb5..000000000000 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -if ! _have_program bpftrace; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "null" "sequential io order" - -dev_id=$(_add_ublk_dev -t null) -_check_add_dev $TID $? - -dev_t=$(_get_disk_dev_t "$dev_id") -bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & -btrace_pid=$! -sleep 2 - -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then - _cleanup_test "null" - exit "$UBLK_SKIP_CODE" -fi - -# run fio over this ublk disk -fio --name=write_seq \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --bs=4k > /dev/null 2>&1 -ERR_CODE=$? -kill "$btrace_pid" -wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" - ERR_CODE=255 -fi -_cleanup_test "null" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 1d4b1d6e059c..46b657143fd6 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -13,7 +13,7 @@ if ! _have_program fio; then exit "$UBLK_SKIP_CODE" fi -_prep_test "null" "sequential io order for MQ" +_prep_test "null" "ublk dispatch won't reorder IO for MQ" dev_id=$(_add_ublk_dev -t null -q 2) _check_add_dev $TID $? @@ -21,15 +21,20 @@ _check_add_dev $TID $? dev_t=$(_get_disk_dev_t "$dev_id") bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & btrace_pid=$! -sleep 2 -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) +for _ in $(seq 100); do + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break + sleep 0.1 +done + +if ! kill -0 "$btrace_pid" 2>/dev/null; then _cleanup_test "null" exit "$UBLK_SKIP_CODE" fi -# run fio over this ublk disk -fio --name=write_seq \ +# run fio over this ublk disk (pinned to CPU 0) +taskset -c 0 fio --name=write_seq \ --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ @@ -39,8 +44,11 @@ fio --name=write_seq \ ERR_CODE=$? kill "$btrace_pid" wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" + +# Check for out-of-order completions detected by bpftrace +if grep -q "^out_of_order:" "$UBLK_TMP"; then + echo "I/O reordering detected:" + grep "^out_of_order:" "$UBLK_TMP" ERR_CODE=255 fi _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt index b2f60a92b118..9d36ba35468f 100644 --- a/tools/testing/selftests/ublk/trace/seq_io.bt +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -2,23 +2,52 @@ $1: dev_t $2: RWBS $3: strlen($2) + + Track request order between block_io_start and block_rq_complete. + Sequence starts at 1 so 0 means "never seen". On first valid + completion, sync complete_seq to handle probe attachment races. + block_rq_complete listed first to reduce missed completion window. */ + BEGIN { - @last_rw[$1, str($2)] = (uint64)0; + @start_seq = (uint64)1; + @complete_seq = (uint64)0; + @out_of_order = (uint64)0; + @start_order[0] = (uint64)0; + delete(@start_order[0]); + printf("BPFTRACE_READY\n"); } + tracepoint:block:block_rq_complete +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ { - $dev = $1; - if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { - $last = @last_rw[$dev, str($2)]; - if ((uint64)args.sector != $last) { - printf("io_out_of_order: exp %llu actual %llu\n", - args.sector, $last); + $expected = @start_order[args.sector]; + if ($expected > 0) { + if (@complete_seq == 0) { + @complete_seq = $expected; + } + if ($expected != @complete_seq) { + printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n", + args.sector, $expected, @complete_seq); + @out_of_order = @out_of_order + 1; } - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + delete(@start_order[args.sector]); + @complete_seq = @complete_seq + 1; } } +tracepoint:block:block_io_start +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ +{ + @start_order[args.sector] = @start_seq; + @start_seq = @start_seq + 1; +} + END { - clear(@last_rw); + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", + @start_seq - 1, @complete_seq, @out_of_order); + clear(@start_order); + clear(@start_seq); + clear(@complete_seq); + clear(@out_of_order); } -- cgit v1.2.3 From 4ac76c51709dff01b285a2d8afea80ca7ae66d28 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:16 +0000 Subject: selftests/mm: default KDIR to build directory Patch series "Various mm kselftests improvements/fixes", v3. Various improvements/fixes for the mm kselftests: - Patch 1-3 extend support for more build configurations: out-of-tree $KDIR, cross-compilation, etc. - Patch 4-7 fix issues related to faulting in pages, introducing a new helper for that purpose. - Patch 8 fixes the value returned by pagemap_ioctl (PASS was always returned, which explains why the issue fixed in patch 6 went unnoticed). - Patch 9 improves the exit code of pfnmap. Net results: - 1 test no longer fails (patch 7) - 3 tests are no longer skipped (patch 4) - More accurate return values for whole suites (patch 8, 9) - Extra tests are more likely to be built (patch 1-3) This patch (of 9): KDIR currently defaults to the running kernel's modules directory when building the page_frag module. The underlying assumption is that most users build the kselftests in order to run them against the system they're built on. This assumption seems questionable, and there is no guarantee that the module can actually be built against the running kernel. Switch the default value of KDIR to the kernel's build directory, i.e. $(O) if O= or KBUILD_OUTPUT= is used, and the source directory otherwise. This seems like the least surprising option: the test module is built against the kernel that has been previously built. Note: we can't use $(top_srcdir) in mm/Makefile because it is only defined once lib.mk is included. Link: https://lkml.kernel.org/r/20260122170224.4056513-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20260122170224.4056513-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: David Hildenbrand Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Ryan Roberts Cc: Shuah Khan Cc: Jason Gunthorpe Cc: John Hubbard Cc: Paolo Abeni Cc: SeongJae Park Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/page_frag/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eaf9312097f7..bb93101e339e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -44,7 +44,7 @@ LDLIBS = -lrt -lpthread -lm # warnings. CFLAGS += -U_FORTIFY_SOURCE -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile index 8c8bb39ffa28..96e5f646e69b 100644 --- a/tools/testing/selftests/mm/page_frag/Makefile +++ b/tools/testing/selftests/mm/page_frag/Makefile @@ -1,5 +1,5 @@ PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../../..)) ifeq ($(V),1) Q = -- cgit v1.2.3 From 1821be740d2e9329805cafa368e476064fde0789 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:17 +0000 Subject: selftests/mm: remove flaky header check Commit 96ed62ea0298 ("mm: page_frag: fix a compile error when kernel is not compiled") introduced a check to avoid attempting to build the page_frag module if is missing. Unfortunately this check only works if KDIR points to /lib/modules/... or an in-tree kernel build. It always fails if KDIR points to an out-of-tree build (i.e. when the kernel was built with O=... make) because only generated headers are present under $KDIR/include/ in that case. A recent commit switched KDIR to default to the kernel's build directory, so that check is no longer justified. Link: https://lkml.kernel.org/r/20260122170224.4056513-3-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Cc: Paolo Abeni Cc: Yunsheng Lin Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index bb93101e339e..4e5c8a330a0c 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -46,12 +46,8 @@ CFLAGS += -U_FORTIFY_SOURCE KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) -ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag else -PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel" -endif -else PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first" endif -- cgit v1.2.3 From 7f532d19c8be76ad2fcd7ab6b0c9eb618f70966b Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:18 +0000 Subject: selftests/mm: pass down full CC and CFLAGS to check_config.sh check_config.sh checks that liburing is available by running the compiler provided as its first argument. This makes two assumptions: 1. CC consists of only one word 2. No extra flag is required Unfortunately, there are many situations where these assumptions don't hold. For instance: - When using Clang, CC consists of multiple words - When cross-compiling, extra flags may be required to allow the compiler to find headers Remove these assumptions by passing down CC and CFLAGS as-is from the Makefile, so that the same command line is used as when actually building the tests. Link: https://lkml.kernel.org/r/20260122170224.4056513-4-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/check_config.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 4e5c8a330a0c..de4afc34e3b1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -230,7 +230,7 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) + CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index 3954f4746161..b84c82bbf875 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,8 +16,7 @@ echo "#include " > $tmpfile_c echo "#include " >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -CC=${1:?"Usage: $0 # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE -- cgit v1.2.3 From bce1dabd310e87fefe0645fec9ba98b84d37e418 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:19 +0000 Subject: selftests/mm: fix usage of FORCE_READ() in cow tests Commit 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") modified FORCE_READ() to take a value instead of a pointer. It also changed most of the call sites accordingly, but missed many of them in cow.c. In those cases, we ended up with the pointer itself being read, not the memory it points to. No failure occurred as a result, so it looks like the tests work just fine without faulting in. However, the huge_zeropage tests explicitly check that pages are populated, so those became skipped. Convert all the remaining FORCE_READ() to fault in the mapped page, as was originally intended. This allows the huge_zeropage tests to run again (3 tests in total). Link: https://lkml.kernel.org/r/20260122170224.4056513-5-kevin.brodsky@arm.com Fixes: 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") Signed-off-by: Kevin Brodsky Acked-by: SeongJae Park Reviewed-by: wang lian Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: Shuah Khan Cc: Usama Anjum Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index accfd198dbda..83b3563be26b 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1612,8 +1612,8 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1663,8 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1719,8 +1719,8 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1773,8 +1773,8 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From 20d3fac43608a1d7ef71991935abc4456baa1da7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:20 +0000 Subject: selftests/mm: check that FORCE_READ() succeeded Many cow tests rely on FORCE_READ() to populate pages. Introduce a helper to make sure that the pages are actually populated, and fail otherwise. Link: https://lkml.kernel.org/r/20260122170224.4056513-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Suggested-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 43 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 83b3563be26b..d9c69c04b67d 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -75,6 +75,18 @@ static bool range_is_swapped(void *addr, size_t size) return true; } +static bool populate_page_checked(char *addr) +{ + bool ret; + + FORCE_READ(*addr); + ret = pagemap_is_populated(pagemap_fd, addr); + if (!ret) + ksft_print_msg("Failed to populate page\n"); + + return ret; +} + struct comm_pipes { int child_ready[2]; int parent_ready[2]; @@ -1549,8 +1561,10 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1612,8 +1626,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1680,10 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1719,8 +1738,10 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1773,8 +1794,10 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From dd2b4e04c09808ff921e3460a608537d1a94595d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:21 +0000 Subject: selftests/mm: introduce helper to read every page FORCE_READ(*addr) ensures that the compiler will emit a load from addr. Several tests need to trigger such a load for a range of pages, ensuring that every page is faulted in, if it wasn't already. Introduce a new helper force_read_pages() that does exactly that and replace existing loops with a call to it. The step size (regular/huge page size) is preserved for all loops, except in split_huge_page_test. Reading every byte is unnecessary; we now read every huge page, matching the following call to check_huge_file(). Link: https://lkml.kernel.org/r/20260122170224.4056513-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 9 +-------- tools/testing/selftests/mm/pfnmap.c | 9 +++------ tools/testing/selftests/mm/split_huge_page_test.c | 6 +----- tools/testing/selftests/mm/vm_util.h | 7 +++++++ 4 files changed, 12 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 05d9d2805ae4..5b12041fa310 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -47,14 +47,7 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i; - - for (i = 0; i < nr_pages; i++) { - unsigned long *addr2 = - ((unsigned long *)(addr + (i * huge_page_size))); - /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(*addr2); - } + force_read_pages(addr, nr_pages, huge_page_size); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index f546dfb10cae..45b5f1cf6019 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -35,18 +35,15 @@ static void signal_handler(int sig) static int test_read_access(char *addr, size_t size, size_t pagesize) { - size_t offs; int ret; if (signal(SIGSEGV, signal_handler) == SIG_ERR) return -EINVAL; ret = sigsetjmp(sigjmp_buf_env, 1); - if (!ret) { - for (offs = 0; offs < size; offs += pagesize) - /* Force a read that the compiler cannot optimize out. */ - *((volatile char *)(addr + offs)); - } + if (!ret) + force_read_pages(addr, size/pagesize, pagesize); + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) return -EINVAL; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 40799f3f0213..e0167111bdd1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -652,11 +652,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) { - char *addr2 = *addr + i; - - FORCE_READ(*addr2); - } + force_read_pages(*addr, fd_size / pmd_pagesize, pmd_pagesize); if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6ad32b1830f1..522f7f9050f5 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -54,6 +54,13 @@ static inline unsigned int pshift(void) return __page_shift; } +static inline void force_read_pages(char *addr, unsigned int nr_pages, + size_t pagesize) +{ + for (unsigned int i = 0; i < nr_pages; i++) + FORCE_READ(addr[i * pagesize]); +} + bool detect_huge_zeropage(void); /* -- cgit v1.2.3 From 7e938f00b00319510ae097e20b7487dfa578d53f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:22 +0000 Subject: selftests/mm: fix faulting-in code in pagemap_ioctl test One of the pagemap_ioctl tests attempts to fault in pages by memcpy()'ing them to an unused buffer. This probably worked originally, but since commit 46036188ea1f ("selftests/mm: build with -O2") the compiler is free to optimise away that unused buffer and the memcpy() with it. As a result there might not be any resident page in the mapping and the test may fail. We don't need to copy all that memory anyway. Just fault in every page. While at it also make sure to compute the number of pages once using simple integer arithmetic instead of ceilf() and implicit conversions. Link: https://lkml.kernel.org/r/20260122170224.4056513-8-kevin.brodsky@arm.com Fixes: 46036188ea1f ("selftests/mm: build with -O2") Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2cb5441f29c7..1896c7d4f72e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1052,11 +1052,10 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - long ret, fd, i, buf_size; + long ret, fd, i, buf_size, nr_pages; struct page_region *vec; char *mem, *fmem; struct stat sbuf; - char *tmp_buf; /* 1. wrong operation */ mem_size = 10 * page_size; @@ -1167,14 +1166,14 @@ int sanity_tests(void) if (fmem == MAP_FAILED) ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); - tmp_buf = malloc(sbuf.st_size); - memcpy(tmp_buf, fmem, sbuf.st_size); + nr_pages = (sbuf.st_size + page_size - 1) / page_size; + force_read_pages(fmem, nr_pages, page_size); ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && - LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + LEN(vec[0]) == nr_pages && (vec[0].categories & PAGE_IS_FILE), "%s Memory mapped file\n", __func__); -- cgit v1.2.3 From 148e5879532f835118e00c3040acef077b57721a Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:23 +0000 Subject: selftests/mm: fix exit code in pagemap_ioctl Make sure pagemap_ioctl exits with an appropriate value: * If the tests are run, call ksft_finished() to report the right status instead of reporting PASS unconditionally. * Report SKIP if userfaultfd isn't available (in line with other tests) * Report FAIL if we failed to open /proc/self/pagemap, as this file has been added a long time ago and doesn't depend on any CONFIG option (returning -EINVAL from main() is meaningless) Link: https://lkml.kernel.org/r/20260122170224.4056513-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Ryan Roberts Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Acked-by: SeongJae Park Reviewed-by: wang lian Reviewed-by: Dev Jain Cc: Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Shuah Khan Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 1896c7d4f72e..2ca8a7e3c27e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1552,7 +1552,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) ksft_print_header(); if (init_uffd()) - ksft_exit_pass(); + ksft_exit_skip("Failed to initialize userfaultfd\n"); ksft_set_plan(117); @@ -1561,7 +1561,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) pagemap_fd = open(PAGEMAP, O_RDONLY); if (pagemap_fd < 0) - return -EINVAL; + ksft_exit_fail_msg("Failed to open " PAGEMAP "\n"); /* 1. Sanity testing */ sanity_tests_sd(); @@ -1733,5 +1733,5 @@ int main(int __attribute__((unused)) argc, char *argv[]) zeropfn_tests(); close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } -- cgit v1.2.3 From fde8353121aa304ee88542f011dd5dc83ced47e4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:24 +0000 Subject: selftests/mm: report SKIP in pfnmap if a check fails pfnmap currently checks the target file in FIXTURE_SETUP(pfnmap), meaning once for every test, and skips the test if any check fails. The target file is the same for every test so this is a little overkill. More importantly, this approach means that the whole suite will report PASS even if all the tests are skipped because kernel configuration (e.g. CONFIG_STRICT_DEVMEM=y) prevented /dev/mem from being mapped, for instance. Let's ensure that KSFT_SKIP is returned as exit code if any check fails by performing the checks in pfnmap_init(), run once. That function also takes care of finding the offset of the pages to be mapped and saves it in a global. The file is now opened only once and the fd saved in a global, but it is still mapped/unmapped for every test, as some of them modify the mapping. Link: https://lkml.kernel.org/r/20260122170224.4056513-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pfnmap.c | 84 +++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index 45b5f1cf6019..4f550822385a 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -25,8 +25,12 @@ #include "kselftest_harness.h" #include "vm_util.h" +#define DEV_MEM_NPAGES 2 + static sigjmp_buf sigjmp_buf_env; static char *file = "/dev/mem"; +static off_t file_offset; +static int fd; static void signal_handler(int sig) { @@ -88,7 +92,7 @@ static int find_ram_target(off_t *offset, break; /* We need two pages. */ - if (end > start + 2 * pagesize) { + if (end > start + DEV_MEM_NPAGES * pagesize) { fclose(file); *offset = start; return 0; @@ -97,11 +101,48 @@ static int find_ram_target(off_t *offset, return -ENOENT; } +static void pfnmap_init(void) +{ + size_t pagesize = getpagesize(); + size_t size = DEV_MEM_NPAGES * pagesize; + void *addr; + + if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { + int err = find_ram_target(&file_offset, pagesize); + + if (err) + ksft_exit_skip("Cannot find ram target in '/proc/iomem': %s\n", + strerror(-err)); + } else { + file_offset = 0; + } + + fd = open(file, O_RDONLY); + if (fd < 0) + ksft_exit_skip("Cannot open '%s': %s\n", file, strerror(errno)); + + /* + * Make sure we can map the file, and perform some basic checks; skip + * the whole suite if anything goes wrong. + * A fresh mapping is then created for every test case by + * FIXTURE_SETUP(pfnmap). + */ + addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset); + if (addr == MAP_FAILED) + ksft_exit_skip("Cannot mmap '%s': %s\n", file, strerror(errno)); + + if (!check_vmflag_pfnmap(addr)) + ksft_exit_skip("Invalid file: '%s'. Not pfnmap'ed\n", file); + + if (test_read_access(addr, size, pagesize)) + ksft_exit_skip("Cannot read-access mmap'ed '%s'\n", file); + + munmap(addr, size); +} + FIXTURE(pfnmap) { - off_t offset; size_t pagesize; - int dev_mem_fd; char *addr1; size_t size1; char *addr2; @@ -112,31 +153,10 @@ FIXTURE_SETUP(pfnmap) { self->pagesize = getpagesize(); - if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { - /* We'll require two physical pages throughout our tests ... */ - if (find_ram_target(&self->offset, self->pagesize)) - SKIP(return, - "Cannot find ram target in '/proc/iomem'\n"); - } else { - self->offset = 0; - } - - self->dev_mem_fd = open(file, O_RDONLY); - if (self->dev_mem_fd < 0) - SKIP(return, "Cannot open '%s'\n", file); - - self->size1 = self->pagesize * 2; + self->size1 = DEV_MEM_NPAGES * self->pagesize; self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); - if (self->addr1 == MAP_FAILED) - SKIP(return, "Cannot mmap '%s'\n", file); - - if (!check_vmflag_pfnmap(self->addr1)) - SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file); - - /* ... and want to be able to read from them. */ - if (test_read_access(self->addr1, self->size1, self->pagesize)) - SKIP(return, "Cannot read-access mmap'ed '%s'\n", file); + fd, file_offset); + ASSERT_NE(self->addr1, MAP_FAILED); self->size2 = 0; self->addr2 = MAP_FAILED; @@ -148,8 +168,6 @@ FIXTURE_TEARDOWN(pfnmap) munmap(self->addr2, self->size2); if (self->addr1 != MAP_FAILED) munmap(self->addr1, self->size1); - if (self->dev_mem_fd >= 0) - close(self->dev_mem_fd); } TEST_F(pfnmap, madvise_disallowed) @@ -189,7 +207,7 @@ TEST_F(pfnmap, munmap_split) */ self->size2 = self->pagesize; self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); + fd, file_offset); ASSERT_NE(self->addr2, MAP_FAILED); } @@ -259,8 +277,12 @@ int main(int argc, char **argv) if (strcmp(argv[i], "--") == 0) { if (i + 1 < argc && strlen(argv[i + 1]) > 0) file = argv[i + 1]; - return test_harness_run(i, argv); + argc = i; + break; } } + + pfnmap_init(); + return test_harness_run(argc, argv); } -- cgit v1.2.3 From dd2c6ec24fca9235ccd1b9bfd382d0ddb419e41a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 16 Jan 2026 13:20:53 +0000 Subject: selftests/mm: remove virtual_address_range test This self test is asserting internal implementation details and is highly vulnerable to internal kernel changes as a result. It is currently failing locally from at least v6.17, and it seems that it may have been failing for longer in many configurations/hardware as it skips if e.g. CONFIG_ANON_VMA_NAME is not specified. With these skips and the fact that run_vmtests.sh won't run the tests in certain configurations it is likely we have simply missed this test being broken in CI for a long while. I have tried multiple versions of these tests and am unable to find a working bisect as previous versions of the test fail also. The tests are essentially mmap()'ing a series of mappings with no hint and asserting what the get_unmapped_area*() functions will come up with, with seemingly few checks for what other mappings may already be in place. It then appears to be mmap()'ing with a hint, and making a series of similar assertions about the internal implementation details of the hinting logic. Commit 0ef3783d7558 ("selftests/mm: add support to test 4PB VA on PPC64"), commit 3bd6137220bb ("selftests/mm: virtual_address_range: avoid reading from VM_IO mappings"), and especially commit a005145b9c96 ("selftests/mm: virtual_address_range: mmap() without PROT_WRITE") are good examples of the whack-a-mole nature of maintaining this test. The last commit there being particularly pertinent as it was accounting for an internal implementation detail change that really should have no bearing on self-tests, that is commit e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping"). The purpose of the mm self-tests are to assert attributes about the API exposed to users, and to ensure that expectations are met. This test is emphatically not doing this, rather making a series of assumptions about internal implementation details and asserting them. It therefore, sadly, seems that the best course is to remove this test altogether. Link: https://lkml.kernel.org/r/20260116132053.857887-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: SeongJae Park Cc: David Hildenbrand Cc: Liam Howlett Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 - tools/testing/selftests/mm/Makefile | 3 - tools/testing/selftests/mm/run_vmtests.sh | 12 - tools/testing/selftests/mm/virtual_address_range.c | 260 --------------------- 4 files changed, 276 deletions(-) delete mode 100644 tools/testing/selftests/mm/virtual_address_range.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..702e5723c35d 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -32,7 +32,6 @@ uffd-unit-tests uffd-wp-mremap mlock-intersect-test mlock-random-test -virtual_address_range gup_test va_128TBswitch map_fixed_noreplace diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index de4afc34e3b1..2fdb05e5a56a 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -136,9 +136,6 @@ endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch -ifneq ($(ARCH),riscv64) -TEST_GEN_FILES += virtual_address_range -endif TEST_GEN_FILES += write_to_hugetlbfs endif diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2dadbfc6e535..452875db532c 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -399,18 +399,6 @@ CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi if [ $VADDR64 -ne 0 ]; then - - # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel - # allows high virtual address allocation requests independent - # of platform's physical memory. - - if [ -x ./virtual_address_range ]; then - prev_policy=$(cat /proc/sys/vm/overcommit_memory) - echo 1 > /proc/sys/vm/overcommit_memory - CATEGORY="hugevm" run_test ./virtual_address_range - echo $prev_policy > /proc/sys/vm/overcommit_memory - fi - # va high address boundary switch test CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh fi # VADDR64 diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c deleted file mode 100644 index 4f0923825ed7..000000000000 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vm_util.h" -#include "kselftest.h" - -/* - * Maximum address range mapped with a single mmap() - * call is little bit more than 1GB. Hence 1GB is - * chosen as the single chunk size for address space - * mapping. - */ - -#define SZ_1GB (1024 * 1024 * 1024UL) -#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) - -#define MAP_CHUNK_SIZE SZ_1GB - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas one below 128TB and one above 128TB - * till it reaches 512TB. One with size 128TB and the - * other being 384TB. - * - * On Arm64 the address space is 256TB and support for - * high mappings up to 4PB virtual address space has - * been added. - * - * On PowerPC64, the address space up to 128TB can be - * mapped without a hint. Addresses beyond 128TB, up to - * 4PB, can be mapped with a hint. - * - */ - -#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) -#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) -#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB -#elif defined(__PPC64__) -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hint_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static void validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); - return; - } - - if (addr > HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); -} - -static void mark_range(char *ptr, size_t size) -{ - if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) { - if (errno == EINVAL) { - /* Depends on CONFIG_ANON_VMA_NAME */ - ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n"); - ksft_finished(); - } else { - ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n"); - } - } -} - -static int is_marked_vma(const char *vma_name) -{ - return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n"); -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -static int validate_complete_va_space(void) -{ - unsigned long start_addr, end_addr, prev_end_addr; - char line[400]; - char prot[6]; - FILE *file; - int fd; - - fd = open("va_dump", O_CREAT | O_WRONLY, 0600); - unlink("va_dump"); - if (fd < 0) { - ksft_test_result_skip("cannot create or open dump file\n"); - ksft_finished(); - } - - file = fopen("/proc/self/maps", "r"); - if (file == NULL) - ksft_exit_fail_msg("cannot open /proc/self/maps\n"); - - prev_end_addr = 0; - while (fgets(line, sizeof(line), file)) { - const char *vma_name = NULL; - int vma_name_start = 0; - unsigned long hop; - - if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n", - &start_addr, &end_addr, prot, &vma_name_start) != 3) - ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); - - if (vma_name_start) - vma_name = line + vma_name_start; - - /* end of userspace mappings; ignore vsyscall mapping */ - if (start_addr & (1UL << 63)) - return 0; - - /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ - if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) - return 1; - - prev_end_addr = end_addr; - - if (prot[0] != 'r') - continue; - - if (check_vmflag_io((void *)start_addr)) - continue; - - /* - * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. - * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 - * addresses after that. If the address was not held by this - * process, write would fail with errno set to EFAULT. - * Anyways, if write returns anything apart from 1, exit the - * program since that would mean a bug in /proc/self/maps. - */ - hop = 0; - while (start_addr + hop < end_addr) { - if (write(fd, (void *)(start_addr + hop), 1) != 1) - return 1; - lseek(fd, 0, SEEK_SET); - - if (is_marked_vma(vma_name)) - munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE); - - hop += MAP_CHUNK_SIZE; - } - } - return 0; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char **hptr; - char *hint; - unsigned long i, lchunks, hchunks; - - ksft_print_header(); - ksft_set_plan(1); - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); - break; - } - - mark_range(ptr[i], MAP_CHUNK_SIZE); - validate_addr(ptr[i], 0); - } - lchunks = i; - hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hint_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - mark_range(hptr[i], MAP_CHUNK_SIZE); - validate_addr(hptr[i], 1); - } - hchunks = i; - if (validate_complete_va_space()) { - ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); - ksft_finished(); - } - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - free(hptr); - - ksft_test_result_pass("Test\n"); - ksft_finished(); -} -- cgit v1.2.3 From 94a62284ede0250e48c886416041ad65907ee917 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:24 -0800 Subject: selftests/damon/sysfs_memcg_path_leak.sh: use kmemleak Patch series "selftests/damon: improve leak detection and wss estimation reliability". Two DAMON selftets, namely 'sysfs_memcg_leak' and 'sysfs_update_schemes_tried_regions_wss_estimation' frequently show intermittent failures due to their unreliable leak detection and working set size estimation. Make those more reliable. This patch (of 5): sysfs_memcg_path_leak.sh determines if the memory leak has happened by seeing if Slab size on /proc/meminfo increases more than expected after an action. Depending on the system and background workloads, the reasonable expectation varies. For the reason, the test frequently shows intermittent failures. Use kmemleak, which is much more reliable and correct, instead. Link: https://lkml.kernel.org/r/20260117020731.226785-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260117020731.226785-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/damon/sysfs_memcg_path_leak.sh | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh index 64c5d8c518a4..33a7ff43ed6c 100755 --- a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh +++ b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh @@ -14,6 +14,13 @@ then exit $ksft_skip fi +kmemleak="/sys/kernel/debug/kmemleak" +if [ ! -f "$kmemleak" ] +then + echo "$kmemleak not found" + exit $ksft_skip +fi + # ensure filter directory echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" @@ -22,22 +29,17 @@ echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/nr_filters" filter_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/0" -before_kb=$(grep Slab /proc/meminfo | awk '{print $2}') - -# try to leak 3000 KiB -for i in {1..102400}; +# try to leak 128 times +for i in {1..128}; do echo "012345678901234567890123456789" > "$filter_dir/memcg_path" done -after_kb=$(grep Slab /proc/meminfo | awk '{print $2}') -# expect up to 1500 KiB free from other tasks memory -expected_after_kb_max=$((before_kb + 1500)) - -if [ "$after_kb" -gt "$expected_after_kb_max" ] +echo scan > "$kmemleak" +kmemleak_report=$(cat "$kmemleak") +if [ "$kmemleak_report" = "" ] then - echo "maybe memcg_path are leaking: $before_kb -> $after_kb" - exit 1 -else exit 0 fi +echo "$kmemleak_report" +exit 1 -- cgit v1.2.3 From 891d206e27dc1a684e460b079d2b53e17135d693 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:25 -0800 Subject: selftests/damon/wss_estimation: test for up to 160 MiB working set size DAMON reads and writes Accessed bits of page tables without manual TLB flush for two reasons. First, it minimizes the overhead. Second, real systems that need DAMON are expected to be memory intensive enough to cause periodic TLB flushes. For test setups that use small test workloads, however, the system's TLB could be big enough to cover whole or most accesses of the test workload. In this case, no page table walk happens and DAMON cannot show any access from the test workload. The test workload for DAMON's working set size estimation selftest is such a case. It accesses only 10 MiB working set, and it turned out there are test setups that have TLBs large enough to cover the 10 MiB data accesses. As a result, the test fails depending on the test machine. Make it more reliable by trying larger working sets up to 160 MiB when it fails. Link: https://lkml.kernel.org/r/20260117020731.226785-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- ..._update_schemes_tried_regions_wss_estimation.py | 29 +++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py index 90ad7409a7a6..bf48ef8e5241 100755 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py @@ -6,9 +6,8 @@ import time import _damon_sysfs -def main(): - # access two 10 MiB memory regions, 2 second per each - sz_region = 10 * 1024 * 1024 +def pass_wss_estimation(sz_region): + # access two regions of given size, 2 seocnds per each region proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( @@ -36,20 +35,38 @@ def main(): wss_collected.append( kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes) + err = kdamonds.stop() + if err is not None: + print('kdamond stop failed: %s' % err) + exit(1) wss_collected.sort() acceptable_error_rate = 0.2 for percentile in [50, 75]: sample = wss_collected[int(len(wss_collected) * percentile / 100)] error_rate = abs(sample - sz_region) / sz_region - print('%d-th percentile (%d) error %f' % - (percentile, sample, error_rate)) + print('%d-th percentile error %f (expect %d, result %d)' % + (percentile, error_rate, sz_region, sample)) if error_rate > acceptable_error_rate: print('the error rate is not acceptable (> %f)' % acceptable_error_rate) print('samples are as below') print('\n'.join(['%d' % wss for wss in wss_collected])) - exit(1) + return False + return True + +def main(): + # DAMON doesn't flush TLB. If the system has large TLB that can cover + # whole test working set, DAMON cannot see the access. Test up to 160 MiB + # test working set. + sz_region_mb = 10 + max_sz_region_mb = 160 + while sz_region_mb <= max_sz_region_mb: + test_pass = pass_wss_estimation(sz_region_mb * 1024 * 1024) + if test_pass is True: + exit(0) + sz_region_mb *= 2 + exit(1) if __name__ == '__main__': main() -- cgit v1.2.3 From 514d1bcb58e0ef93fafa4f9c3035d604a4219867 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:26 -0800 Subject: selftests/damon/access_memory: add repeat mode 'access_memory' is an artificial memory access generator program that is used for a few DAMON selftests. It accesses a given number of regions one by one only once, and exits. Depending on systems, the test workload may exit faster than expected, making the tests unreliable. For reliable control of the artificial memory access pattern, add a mode to make it repeat running. Link: https://lkml.kernel.org/r/20260117020731.226785-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory.c | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 56b17e8fe1be..567793b11107 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -8,6 +8,11 @@ #include #include +enum access_mode { + ACCESS_MODE_ONCE, + ACCESS_MODE_REPEAT, +}; + int main(int argc, char *argv[]) { char **regions; @@ -15,10 +20,12 @@ int main(int argc, char *argv[]) int nr_regions; int sz_region; int access_time_ms; + enum access_mode mode = ACCESS_MODE_ONCE; + int i; - if (argc != 4) { - printf("Usage: %s